Skip to content

Commit 78739fd

Browse files
committed
[DAG] Enable combineShiftOfShiftedLogic folds after type legalization
The combineShiftOfShiftedLogic fold was previously limited to combining before type legalization in order to prevent regressions. Those regressions appear to occur only on AMDGPU (at least in our current lit tests), and I've addressed them by adding AMDGPUTargetLowering::isDesirableToCommuteWithShift overrides. Fixes #57872 Differential Revision: https://reviews.llvm.org/D136042
1 parent b1a6c6c commit 78739fd

25 files changed

+982
-793
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8877,13 +8877,9 @@ SDValue DAGCombiner::visitShiftByConstant(SDNode *N) {
88778877
if (!LHS.hasOneUse() || !TLI.isDesirableToCommuteWithShift(N, Level))
88788878
return SDValue();
88798879

8880-
// TODO: This is limited to early combining because it may reveal regressions
8881-
// otherwise. But since we just checked a target hook to see if this is
8882-
// desirable, that should have filtered out cases where this interferes
8883-
// with some other pattern matching.
8884-
if (!LegalTypes)
8885-
if (SDValue R = combineShiftOfShiftedLogic(N, DAG))
8886-
return R;
8880+
// Fold shift(bitop(shift(x,c1),y), c2) -> bitop(shift(x,c1+c2),shift(y,c2)).
8881+
if (SDValue R = combineShiftOfShiftedLogic(N, DAG))
8882+
return R;
88878883

88888884
// We want to pull some binops through shifts, so that we have (and (shift))
88898885
// instead of (shift (and)), likewise for add, or, xor, etc. This sort of

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,39 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
839839
return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
840840
}
841841

842+
bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
843+
const SDNode* N, CombineLevel Level) const {
844+
assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
845+
N->getOpcode() == ISD::SRL) &&
846+
"Expected shift op");
847+
// Always commute pre-type legalization and right shifts.
848+
// We're looking for shl(or(x,y),z) patterns.
849+
if (Level < CombineLevel::AfterLegalizeTypes ||
850+
N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
851+
return true;
852+
853+
// If the only user is an i32 right-shift, then don't destroy a BFE pattern.
854+
if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
855+
(N->use_begin()->getOpcode() == ISD::SRA ||
856+
N->use_begin()->getOpcode() == ISD::SRL))
857+
return false;
858+
859+
// Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
860+
auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
861+
if (LHS.getOpcode() != ISD::SHL)
862+
return false;
863+
auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
864+
auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
865+
auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
866+
return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
867+
LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
868+
RHSLd->getExtensionType() == ISD::ZEXTLOAD;
869+
};
870+
SDValue LHS = N->getOperand(0).getOperand(0);
871+
SDValue RHS = N->getOperand(0).getOperand(1);
872+
return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
873+
}
874+
842875
//===---------------------------------------------------------------------===//
843876
// TargetLowering Callbacks
844877
//===---------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,9 @@ class AMDGPUTargetLowering : public TargetLowering {
173173

174174
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
175175

176+
bool isDesirableToCommuteWithShift(const SDNode *N,
177+
CombineLevel Level) const override;
178+
176179
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
177180
ISD::NodeType ExtendKind) const override;
178181

llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1448,24 +1448,23 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
14481448
; SI-NEXT: s_mov_b32 s7, s3
14491449
; SI-NEXT: s_waitcnt vmcnt(0)
14501450
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
1451-
; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v4
1452-
; SI-NEXT: v_and_b32_e32 v7, 0xff00, v4
14531451
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
14541452
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
14551453
; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
14561454
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
1457-
; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
1455+
; SI-NEXT: v_add_i32_e32 v7, vcc, 9, v4
1456+
; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4
14581457
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
14591458
; SI-NEXT: s_waitcnt expcnt(0)
1460-
; SI-NEXT: v_and_b32_e32 v0, 0xff, v4
1461-
; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5
1462-
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6
1463-
; SI-NEXT: v_or_b32_e32 v0, v7, v0
1464-
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
1459+
; SI-NEXT: v_and_b32_e32 v0, 0xff, v7
1460+
; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v5
1461+
; SI-NEXT: v_or_b32_e32 v0, v6, v0
1462+
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
1463+
; SI-NEXT: v_and_b32_e32 v4, 0xff000000, v4
14651464
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
1466-
; SI-NEXT: v_or_b32_e32 v1, v1, v2
1467-
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
14681465
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1466+
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1467+
; SI-NEXT: v_or_b32_e32 v1, v4, v1
14691468
; SI-NEXT: v_or_b32_e32 v0, v1, v0
14701469
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
14711470
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0

llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -151,26 +151,26 @@ define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 {
151151
; GFX7-ALIGNED-LABEL: global_load_2xi16_align1:
152152
; GFX7-ALIGNED: ; %bb.0:
153153
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154-
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0
155-
; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
156-
; GFX7-ALIGNED-NEXT: flat_load_ubyte v4, v[2:3]
157-
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 3, v0
154+
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v0
158155
; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
159-
; GFX7-ALIGNED-NEXT: flat_load_ubyte v5, v[0:1]
156+
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v4, vcc, 1, v0
157+
; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
158+
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v6, vcc, 3, v0
159+
; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
160+
; GFX7-ALIGNED-NEXT: flat_load_ubyte v6, v[6:7]
161+
; GFX7-ALIGNED-NEXT: flat_load_ubyte v4, v[4:5]
160162
; GFX7-ALIGNED-NEXT: flat_load_ubyte v2, v[2:3]
161-
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v0, vcc, 2, v0
162-
; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
163163
; GFX7-ALIGNED-NEXT: flat_load_ubyte v0, v[0:1]
164164
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3)
165-
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v4
165+
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v6
166166
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2)
167-
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v5
167+
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v4
168168
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1)
169-
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2
169+
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2
170170
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
171-
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0
172-
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
173171
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v1, v0
172+
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v3, v2
173+
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
174174
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
175175
;
176176
; GFX7-UNALIGNED-LABEL: global_load_2xi16_align1:

llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -208,19 +208,19 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
208208
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0
209209
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0
210210
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 3, v0
211-
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen
212211
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], 0 offen
213-
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], 0 offen
212+
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen
214213
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen
214+
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], 0 offen
215215
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3)
216-
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2
216+
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3
217217
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2)
218-
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3
218+
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2
219219
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1)
220-
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0
220+
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
221221
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
222+
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0
222223
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v3, v1
223-
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
224224
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
225225
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
226226
;

llvm/test/CodeGen/AMDGPU/idot8s.ll

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2818,50 +2818,50 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
28182818
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
28192819
; GFX7-NEXT: s_addc_u32 s13, s13, 0
28202820
; GFX7-NEXT: s_waitcnt vmcnt(2)
2821-
; GFX7-NEXT: v_bfe_i32 v7, v2, 0, 4
2822-
; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
2821+
; GFX7-NEXT: v_bfe_i32 v8, v2, 0, 4
2822+
; GFX7-NEXT: v_ashrrev_i32_e32 v3, 28, v2
28232823
; GFX7-NEXT: s_waitcnt vmcnt(1)
2824-
; GFX7-NEXT: v_bfe_i32 v14, v0, 0, 4
2825-
; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4
2826-
; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4
2827-
; GFX7-NEXT: v_bfe_i32 v6, v2, 8, 4
2828-
; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v2
2824+
; GFX7-NEXT: v_bfe_i32 v15, v0, 0, 4
2825+
; GFX7-NEXT: v_bfe_i32 v4, v2, 24, 4
2826+
; GFX7-NEXT: v_bfe_i32 v5, v2, 20, 4
2827+
; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4
2828+
; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4
28292829
; GFX7-NEXT: v_bfe_i32 v9, v2, 12, 4
28302830
; GFX7-NEXT: v_bfe_i32 v2, v2, 4, 4
2831-
; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7
2832-
; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4
2833-
; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4
2834-
; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4
2835-
; GFX7-NEXT: v_bfe_i32 v13, v0, 8, 4
2836-
; GFX7-NEXT: v_ashrrev_i32_e32 v15, 28, v0
2831+
; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8
2832+
; GFX7-NEXT: v_ashrrev_i32_e32 v10, 28, v0
2833+
; GFX7-NEXT: v_bfe_i32 v11, v0, 24, 4
2834+
; GFX7-NEXT: v_bfe_i32 v12, v0, 20, 4
2835+
; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4
2836+
; GFX7-NEXT: v_bfe_i32 v14, v0, 8, 4
28372837
; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4
28382838
; GFX7-NEXT: v_bfe_i32 v0, v0, 4, 4
2839-
; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14
2839+
; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15
28402840
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
28412841
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
28422842
; GFX7-NEXT: s_waitcnt vmcnt(0)
2843-
; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
2844-
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
2843+
; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
2844+
; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7
28452845
; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9
2846-
; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13
2846+
; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14
28472847
; GFX7-NEXT: v_lshlrev_b32_e32 v16, 24, v16
28482848
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
28492849
; GFX7-NEXT: v_alignbit_b32 v9, 0, v9, 24
28502850
; GFX7-NEXT: v_alignbit_b32 v16, 0, v16, 24
2851-
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
2851+
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
2852+
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
2853+
; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13
2854+
; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
28522855
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5
28532856
; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12
2854-
; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
2857+
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
28552858
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
28562859
; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11
28572860
; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
28582861
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
28592862
; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10
28602863
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
2861-
; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8
2862-
; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15
28632864
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
2864-
; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
28652865
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
28662866
; GFX7-NEXT: s_endpgm
28672867
;

llvm/test/CodeGen/AMDGPU/idot8u.ll

Lines changed: 25 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2444,40 +2444,36 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
24442444
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
24452445
; GFX7-NEXT: s_addc_u32 s13, s13, 0
24462446
; GFX7-NEXT: s_waitcnt vmcnt(2)
2447-
; GFX7-NEXT: v_and_b32_e32 v7, 15, v2
2448-
; GFX7-NEXT: v_bfe_u32 v6, v2, 4, 4
2447+
; GFX7-NEXT: v_and_b32_e32 v8, 15, v2
2448+
; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
24492449
; GFX7-NEXT: s_waitcnt vmcnt(1)
2450-
; GFX7-NEXT: v_and_b32_e32 v14, 15, v0
2451-
; GFX7-NEXT: v_bfe_u32 v8, v2, 12, 4
2452-
; GFX7-NEXT: v_bfe_u32 v13, v0, 4, 4
2453-
; GFX7-NEXT: v_bfe_u32 v15, v0, 12, 4
2450+
; GFX7-NEXT: v_and_b32_e32 v15, 15, v0
2451+
; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
2452+
; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4
2453+
; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4
2454+
; GFX7-NEXT: v_bfe_u32 v7, v2, 4, 4
2455+
; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v2
2456+
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 12, v2
2457+
; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
2458+
; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4
2459+
; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4
2460+
; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4
2461+
; GFX7-NEXT: v_bfe_u32 v14, v0, 4, 4
2462+
; GFX7-NEXT: v_lshrrev_b32_e32 v16, 28, v0
2463+
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 12, v0
24542464
; GFX7-NEXT: s_waitcnt vmcnt(0)
2465+
; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
2466+
; GFX7-NEXT: v_and_b32_e32 v2, 0xf000000, v2
2467+
; GFX7-NEXT: v_and_b32_e32 v0, 0xf000000, v0
24552468
; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
2456-
; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 4
2457-
; GFX7-NEXT: v_bfe_u32 v12, v0, 8, 4
2458-
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8
2459-
; GFX7-NEXT: v_lshlrev_b32_e32 v15, 24, v15
2469+
; GFX7-NEXT: v_alignbit_b32 v2, s10, v2, 24
2470+
; GFX7-NEXT: v_alignbit_b32 v0, 0, v0, 24
24602471
; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
2461-
; GFX7-NEXT: v_alignbit_b32 v8, 0, v8, 24
2462-
; GFX7-NEXT: v_alignbit_b32 v14, 0, v15, 24
2463-
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
2464-
; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 4
2465-
; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v2
2466-
; GFX7-NEXT: v_bfe_u32 v11, v0, 16, 4
2467-
; GFX7-NEXT: v_lshrrev_b32_e32 v16, 28, v0
2468-
; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1
2469-
; GFX7-NEXT: v_bfe_u32 v3, v2, 20, 4
2470-
; GFX7-NEXT: v_bfe_u32 v10, v0, 20, 4
2471-
; GFX7-NEXT: v_alignbit_b32 v2, v9, v2, 24
2472-
; GFX7-NEXT: v_alignbit_b32 v0, v16, v0, 24
2473-
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
2474-
; GFX7-NEXT: v_lshrrev_b32_e32 v9, 8, v2
2475-
; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
2476-
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v0
2477-
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
2478-
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1
24792472
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
2480-
; GFX7-NEXT: v_mad_u32_u24 v0, v9, v7, v0
2473+
; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
2474+
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
2475+
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
2476+
; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
24812477
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
24822478
; GFX7-NEXT: s_endpgm
24832479
;

0 commit comments

Comments
 (0)