Skip to content

Commit

Permalink
Revert "[DAGCombiner] handle more store value forwarding"
Browse files Browse the repository at this point in the history
This reverts commit f35a09d.

Causes miscompiles, see D138899
  • Loading branch information
aeubanks committed Feb 14, 2023
1 parent ac6219d commit 7c6b46e
Show file tree
Hide file tree
Showing 11 changed files with 263 additions and 221 deletions.
59 changes: 11 additions & 48 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Expand Up @@ -362,7 +362,6 @@ namespace {
SDValue SplitIndexingFromLoad(LoadSDNode *LD);
bool SliceUpLoad(SDNode *N);

StoreSDNode *getUniqueStoreFeeding(LoadSDNode *LD, int64_t &Offset);
// Scalars have size 0 to distinguish from singleton vectors.
SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD);
bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val);
Expand Down Expand Up @@ -17614,53 +17613,11 @@ bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) {
return false;
}

// Find the (unique) store on LD's chain that writes to the same base
// address as LD, if one can be proven. On success, \p Offset is set (by
// BaseIndexOffset::equalBaseIndex) to the load address's byte offset
// relative to the store's address. Returns nullptr when no such store is
// found or when the store may be aliased by other chain operands.
StoreSDNode *DAGCombiner::getUniqueStoreFeeding(LoadSDNode *LD,
int64_t &Offset) {
// Start from the load's chain operand.
SDValue Chain = LD->getOperand(0);

// Look through CALLSEQ_START.
if (Chain.getOpcode() == ISD::CALLSEQ_START)
Chain = Chain->getOperand(0);

StoreSDNode *ST = nullptr;
SmallVector<SDValue, 8> Aliases;
if (Chain.getOpcode() == ISD::TokenFactor) {
// Look for unique store within the TokenFactor.
for (SDValue Op : Chain->ops()) {
// Skip non-store operands of the TokenFactor.
StoreSDNode *Store = dyn_cast<StoreSDNode>(Op.getNode());
if (!Store)
continue;
BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
BaseIndexOffset BasePtrST = BaseIndexOffset::match(Store, DAG);
if (BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset)) {
// Make sure the store is not aliased with any nodes in TokenFactor.
// Accept the store only if GatherAllAliases reports no aliases, or
// its sole reported "alias" is the store itself.
GatherAllAliases(Store, Chain, Aliases);
if (Aliases.empty() ||
(Aliases.size() == 1 && Aliases.front().getNode() == Store))
ST = Store;
// NOTE(review): we break after the first base-matching store even
// when the alias check above failed, so later TokenFactor operands
// are never considered -- confirm this is the intended behavior.
break;
}
}
} else {
// Single-node chain: accept it directly if it is a store whose base
// pointer matches the load's.
StoreSDNode *Store = dyn_cast<StoreSDNode>(Chain.getNode());
if (Store) {
BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
BaseIndexOffset BasePtrST = BaseIndexOffset::match(Store, DAG);
if (BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset))
ST = Store;
}
}

// May be nullptr; callers must check before use.
return ST;
}

SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
if (OptLevel == CodeGenOpt::None || !LD->isSimple())
return SDValue();
SDValue InputChain = LD->getOperand(0);
int64_t Offset;

StoreSDNode *ST = getUniqueStoreFeeding(LD, Offset);
SDValue Chain = LD->getOperand(0);
StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode());
// TODO: Relax this restriction for unordered atomics (see D66309)
if (!ST || !ST->isSimple() || ST->getAddressSpace() != LD->getAddressSpace())
return SDValue();
Expand Down Expand Up @@ -17688,6 +17645,12 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
if (LdStScalable && DAG.getDataLayout().isBigEndian())
return SDValue();

BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
int64_t Offset;
if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset))
return SDValue();

// Normalize for Endianness. After this Offset=0 will denote that the least
// significant bit in the loaded value maps to the least significant bit in
// the stored value). With Offset=n (for n > 0) the loaded value starts at the
Expand Down Expand Up @@ -17730,7 +17693,7 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
// Simple case: Direct non-truncating forwarding
if (LDType.getSizeInBits() == LdMemSize)
return ReplaceLd(LD, ST->getValue(), InputChain);
return ReplaceLd(LD, ST->getValue(), Chain);
// Can we model the truncate and extension with an and mask?
if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
!LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
Expand All @@ -17740,7 +17703,7 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
StMemSize.getFixedValue()),
SDLoc(ST), STType);
auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
return ReplaceLd(LD, Val, InputChain);
return ReplaceLd(LD, Val, Chain);
}
}

Expand Down Expand Up @@ -17777,7 +17740,7 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
}
if (!extendLoadedValueToExtension(LD, Val))
continue;
return ReplaceLd(LD, Val, InputChain);
return ReplaceLd(LD, Val, Chain);
} while (false);

// On failure, cleanup dead nodes we may have created.
Expand Down
38 changes: 21 additions & 17 deletions llvm/test/CodeGen/AMDGPU/ctpop16.ll
Expand Up @@ -457,50 +457,54 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr
;
; EG-LABEL: v_ctpop_v4i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 42, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T8.XY, T0.X, 0, #1
; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV T0.Y, T4.X,
; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: AND_INT * T0.W, T8.X, literal.x,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: MOV T2.X, T0.X,
; EG-NEXT: MOV * T3.X, T0.Y,
; EG-NEXT: MOV T0.X, T4.X,
; EG-NEXT: MOV * T0.Y, PV.X,
; EG-NEXT: AND_INT * T0.W, PV.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T0.X, T3.X,
; EG-NEXT: MOV * T4.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: LSHR * T0.W, T8.X, literal.x,
; EG-NEXT: MOV T0.Z, PS,
; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV * T0.X, T5.X,
; EG-NEXT: AND_INT * T0.W, T8.Y, literal.x,
; EG-NEXT: MOV T0.Y, T5.X,
; EG-NEXT: AND_INT * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T5.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: LSHR * T0.W, T8.Y, literal.x,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
Expand Down
80 changes: 48 additions & 32 deletions llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
Expand Up @@ -1249,7 +1249,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
Expand All @@ -1258,13 +1258,17 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T5.W, T5.Y, literal.x,
; EG-NEXT: MOV T2.X, T5.X,
; EG-NEXT: MOV * T3.X, T5.Y,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T0.Z, PS,
; EG-NEXT: LSHR * T5.W, PV.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T5.Z, T5.Y, literal.x,
; EG-NEXT: AND_INT * T5.Z, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHR * T5.Y, T5.X, literal.x,
; EG-NEXT: LSHR * T5.Y, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T5.X, T5.X, literal.x,
; EG-NEXT: AND_INT T5.X, T0.Y, literal.x,
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
%load = load <4 x i16>, ptr addrspace(4) %in
Expand Down Expand Up @@ -1338,25 +1342,29 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x,
; EG-NEXT: MOV T2.X, T5.X,
; EG-NEXT: MOV * T3.X, T5.Y,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T0.Z, PS,
; EG-NEXT: BFE_INT * T5.Z, PV.Z, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x,
; EG-NEXT: LSHR * T0.W, T5.Y, literal.x,
; EG-NEXT: BFE_INT T5.X, T0.Y, 0.0, literal.x,
; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x,
; EG-NEXT: LSHR * T0.W, T5.X, literal.x,
; EG-NEXT: BFE_INT T5.W, PV.W, 0.0, literal.x,
; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
; EG-NEXT: BFE_INT * T6.Y, PS, 0.0, literal.y,
; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
; EG-NEXT: BFE_INT * T5.Y, PS, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
%load = load <4 x i16>, ptr addrspace(4) %in
%ext = sext <4 x i16> %load to <4 x i32>
Expand Down Expand Up @@ -4871,25 +4879,29 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1
; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T8.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T6.Z, T5.Y, literal.x,
; EG-NEXT: MOV T2.X, T5.X,
; EG-NEXT: MOV * T3.X, T5.Y,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T0.Z, PS,
; EG-NEXT: LSHR * T5.Z, PV.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T6.X, T5.Y, literal.x,
; EG-NEXT: MOV T6.Y, 0.0,
; EG-NEXT: LSHR T5.Z, T5.X, literal.y,
; EG-NEXT: AND_INT * T5.X, T5.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: AND_INT T5.X, T0.Z, literal.x,
; EG-NEXT: MOV T5.Y, 0.0,
; EG-NEXT: MOV T6.W, 0.0,
; EG-NEXT: MOV * T5.W, 0.0,
; EG-NEXT: LSHR T6.Z, T0.Y, literal.y,
; EG-NEXT: AND_INT * T6.X, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T6.Y, 0.0,
; EG-NEXT: MOV T5.W, 0.0,
; EG-NEXT: MOV * T6.W, 0.0,
; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
Expand Down Expand Up @@ -4991,7 +5003,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
Expand All @@ -5000,17 +5012,21 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: ASHR * T5.W, T5.X, literal.x,
; EG-NEXT: MOV T2.X, T5.X,
; EG-NEXT: MOV * T3.X, T5.Y,
; EG-NEXT: MOV T0.Y, PS,
; EG-NEXT: MOV * T0.Z, PV.X,
; EG-NEXT: ASHR * T5.W, PV.Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
; EG-NEXT: ASHR T5.Z, T5.X, literal.y,
; EG-NEXT: ASHR * T7.W, T5.Y, literal.z,
; EG-NEXT: ASHR T5.Z, T0.Z, literal.y,
; EG-NEXT: ASHR * T7.W, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x,
; EG-NEXT: ASHR * T7.Z, T5.Y, literal.x,
; EG-NEXT: BFE_INT T5.X, T0.Z, 0.0, literal.x,
; EG-NEXT: ASHR * T7.Z, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T7.X, T0.Y, 0.0, literal.x,
; EG-NEXT: ASHR T5.Y, PV.X, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
Expand Down

0 comments on commit 7c6b46e

Please sign in to comment.