From 7c6b46e87eafbb5150659f16cb211e5d732372ff Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Mon, 13 Feb 2023 19:00:32 -0800 Subject: [PATCH] Revert "[DAGCombiner] handle more store value forwarding" This reverts commit f35a09daebd0a90daa536432e62a2476f708150d. Causes miscompiles, see D138899 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 59 ++---- llvm/test/CodeGen/AMDGPU/ctpop16.ll | 38 ++-- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 80 ++++---- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 172 +++++++++++------- llvm/test/CodeGen/AMDGPU/shl.ll | 42 +++-- llvm/test/CodeGen/AMDGPU/sra.ll | 55 +++--- llvm/test/CodeGen/Mips/o32_cc_byval.ll | 5 +- llvm/test/CodeGen/PowerPC/aix-cc-byval.ll | 16 +- llvm/test/CodeGen/PowerPC/byval-lhs.ll | 8 +- .../PowerPC/ppc64-byval-larger-struct.ll | 6 + llvm/test/CodeGen/X86/fastcc-byval.ll | 3 +- 11 files changed, 263 insertions(+), 221 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f4f82db4c9b1e..00ca9d3434d5a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -362,7 +362,6 @@ namespace { SDValue SplitIndexingFromLoad(LoadSDNode *LD); bool SliceUpLoad(SDNode *N); - StoreSDNode *getUniqueStoreFeeding(LoadSDNode *LD, int64_t &Offset); // Scalars have size 0 to distinguish from singleton vectors. SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD); bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val); @@ -17614,53 +17613,11 @@ bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) { return false; } -StoreSDNode *DAGCombiner::getUniqueStoreFeeding(LoadSDNode *LD, - int64_t &Offset) { - SDValue Chain = LD->getOperand(0); - - // Look through CALLSEQ_START. 
- if (Chain.getOpcode() == ISD::CALLSEQ_START) - Chain = Chain->getOperand(0); - - StoreSDNode *ST = nullptr; - SmallVector<SDValue, 8> Aliases; - if (Chain.getOpcode() == ISD::TokenFactor) { - // Look for unique store within the TokenFactor. - for (SDValue Op : Chain->ops()) { - StoreSDNode *Store = dyn_cast<StoreSDNode>(Op.getNode()); - if (!Store) - continue; - BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG); - BaseIndexOffset BasePtrST = BaseIndexOffset::match(Store, DAG); - if (BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset)) { - // Make sure the store is not aliased with any nodes in TokenFactor. - GatherAllAliases(Store, Chain, Aliases); - if (Aliases.empty() || - (Aliases.size() == 1 && Aliases.front().getNode() == Store)) - ST = Store; - break; - } - } - } else { - StoreSDNode *Store = dyn_cast<StoreSDNode>(Chain.getNode()); - if (Store) { - BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG); - BaseIndexOffset BasePtrST = BaseIndexOffset::match(Store, DAG); - if (BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset)) - ST = Store; - } - } - - return ST; -} - SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { if (OptLevel == CodeGenOpt::None || !LD->isSimple()) return SDValue(); - SDValue InputChain = LD->getOperand(0); - int64_t Offset; - - StoreSDNode *ST = getUniqueStoreFeeding(LD, Offset); + SDValue Chain = LD->getOperand(0); + StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode()); // TODO: Relax this restriction for unordered atomics (see D66309) if (!ST || !ST->isSimple() || ST->getAddressSpace() != LD->getAddressSpace()) return SDValue(); @@ -17688,6 +17645,12 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { if (LdStScalable && DAG.getDataLayout().isBigEndian()) return SDValue(); + BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG); + BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG); + int64_t Offset; + if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset)) + return SDValue(); + // Normalize for Endianness. 
After this Offset=0 will denote that the least // significant bit in the loaded value maps to the least significant bit in // the stored value). With Offset=n (for n > 0) the loaded value starts at the @@ -17730,7 +17693,7 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { if (Offset == 0 && LDType == STType && STMemType == LDMemType) { // Simple case: Direct non-truncating forwarding if (LDType.getSizeInBits() == LdMemSize) - return ReplaceLd(LD, ST->getValue(), InputChain); + return ReplaceLd(LD, ST->getValue(), Chain); // Can we model the truncate and extension with an and mask? if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() && !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) { @@ -17740,7 +17703,7 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { StMemSize.getFixedValue()), SDLoc(ST), STType); auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask); - return ReplaceLd(LD, Val, InputChain); + return ReplaceLd(LD, Val, Chain); } } @@ -17777,7 +17740,7 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { } if (!extendLoadedValueToExtension(LD, Val)) continue; - return ReplaceLd(LD, Val, InputChain); + return ReplaceLd(LD, Val, Chain); } while (false); // On failure, cleanup dead nodes we may have created. 
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index 0acd24dc5e3a0..1b2bca5cdc00d 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -457,50 +457,54 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr ; ; EG-LABEL: v_ctpop_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 42, @11, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T8.XY, T0.X, 0, #1 +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV T0.Y, T4.X, -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: AND_INT * T0.W, T8.X, literal.x, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: MOV T2.X, T0.X, +; EG-NEXT: MOV * T3.X, T0.Y, +; EG-NEXT: MOV T0.X, T4.X, +; EG-NEXT: MOV * T0.Y, PV.X, +; EG-NEXT: AND_INT * T0.W, PV.Y, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV T0.X, T3.X, ; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: LSHR * T0.W, T8.X, literal.x, +; EG-NEXT: MOV T0.Z, PS, +; EG-NEXT: LSHR * T0.W, T0.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LSHL * T0.W, PV.W, 
literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, ; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV * T0.X, T5.X, -; EG-NEXT: AND_INT * T0.W, T8.Y, literal.x, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: AND_INT * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, ; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: LSHR * T0.W, T8.Y, literal.x, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T0.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 56e602f6918e7..971ae8ea46d75 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -1249,7 +1249,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -1258,13 +1258,17 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHR * T5.W, T5.Y, literal.x, +; EG-NEXT: MOV T2.X, T5.X, +; EG-NEXT: MOV * T3.X, T5.Y, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: MOV * T0.Z, PS, +; EG-NEXT: 
LSHR * T5.W, PV.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T5.Z, T5.Y, literal.x, +; EG-NEXT: AND_INT * T5.Z, T0.Z, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHR * T5.Y, T5.X, literal.x, +; EG-NEXT: LSHR * T5.Y, T0.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T5.X, T5.X, literal.x, +; EG-NEXT: AND_INT T5.X, T0.Y, literal.x, ; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) %load = load <4 x i16>, ptr addrspace(4) %in @@ -1338,8 +1342,8 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1 +; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: @@ -1347,16 +1351,20 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x, +; EG-NEXT: MOV T2.X, T5.X, +; EG-NEXT: MOV * T3.X, T5.Y, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: MOV * T0.Z, PS, +; EG-NEXT: BFE_INT * T5.Z, PV.Z, 0.0, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T5.Y, literal.x, +; EG-NEXT: BFE_INT T5.X, T0.Y, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T0.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T5.X, literal.x, +; EG-NEXT: BFE_INT T5.W, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T0.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x, -; EG-NEXT: 
BFE_INT * T6.Y, PS, 0.0, literal.y, +; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x, +; EG-NEXT: BFE_INT * T5.Y, PS, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) %load = load <4 x i16>, ptr addrspace(4) %in %ext = sext <4 x i16> %load to <4 x i32> @@ -4871,25 +4879,29 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1 +; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T8.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: ; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHR * T6.Z, T5.Y, literal.x, +; EG-NEXT: MOV T2.X, T5.X, +; EG-NEXT: MOV * T3.X, T5.Y, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: MOV * T0.Z, PS, +; EG-NEXT: LSHR * T5.Z, PV.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T6.X, T5.Y, literal.x, -; EG-NEXT: MOV T6.Y, 0.0, -; EG-NEXT: LSHR T5.Z, T5.X, literal.y, -; EG-NEXT: AND_INT * T5.X, T5.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: AND_INT T5.X, T0.Z, literal.x, ; EG-NEXT: MOV T5.Y, 0.0, -; EG-NEXT: MOV T6.W, 0.0, -; EG-NEXT: MOV * T5.W, 0.0, +; EG-NEXT: LSHR T6.Z, T0.Y, literal.y, +; EG-NEXT: AND_INT * T6.X, T0.Y, literal.x, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: MOV T6.Y, 0.0, +; EG-NEXT: MOV T5.W, 0.0, +; EG-NEXT: MOV * T6.W, 0.0, ; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -4991,7 +5003,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; 
EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1 ; EG-NEXT: CF_END @@ -5000,17 +5012,21 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: ASHR * T5.W, T5.X, literal.x, +; EG-NEXT: MOV T2.X, T5.X, +; EG-NEXT: MOV * T3.X, T5.Y, +; EG-NEXT: MOV T0.Y, PS, +; EG-NEXT: MOV * T0.Z, PV.X, +; EG-NEXT: ASHR * T5.W, PV.Z, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x, -; EG-NEXT: ASHR T5.Z, T5.X, literal.y, -; EG-NEXT: ASHR * T7.W, T5.Y, literal.z, +; EG-NEXT: ASHR T5.Z, T0.Z, literal.y, +; EG-NEXT: ASHR * T7.W, T0.Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x, -; EG-NEXT: ASHR * T7.Z, T5.Y, literal.x, +; EG-NEXT: BFE_INT T5.X, T0.Z, 0.0, literal.x, +; EG-NEXT: ASHR * T7.Z, T0.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T7.X, T0.Y, 0.0, literal.x, ; EG-NEXT: ASHR T5.Y, PV.X, literal.y, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 08ff26302066f..0c433240f5f95 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -1636,7 +1636,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 12, @9, 
KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -1645,13 +1645,17 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHR * T5.W, T5.Y, literal.x, +; EG-NEXT: MOV T2.X, T5.X, +; EG-NEXT: MOV * T3.X, T5.Y, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: MOV * T0.Z, PS, +; EG-NEXT: LSHR * T5.W, PV.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T5.Z, T5.Y, literal.x, +; EG-NEXT: AND_INT * T5.Z, T0.Z, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHR * T5.Y, T5.X, literal.x, +; EG-NEXT: LSHR * T5.Y, T0.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T5.X, T5.X, literal.x, +; EG-NEXT: AND_INT T5.X, T0.Y, literal.x, ; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) ; @@ -1659,7 +1663,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; CM: ; %bb.0: ; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 -; CM-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X ; CM-NEXT: CF_END ; CM-NEXT: PAD @@ -1668,13 +1672,17 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; CM-NEXT: ALU clause starting at 8: ; CM-NEXT: MOV * T5.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: LSHR * T5.W, T5.Y, literal.x, +; CM-NEXT: MOV * T2.X, T5.X, +; CM-NEXT: MOV T3.X, T5.Y, +; CM-NEXT: MOV * T0.Y, PV.X, +; CM-NEXT: MOV * T0.Z, PV.X, +; CM-NEXT: LSHR * T5.W, PV.Z, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: AND_INT * T5.Z, T5.Y, literal.x, +; CM-NEXT: AND_INT * T5.Z, T0.Z, literal.x, ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; CM-NEXT: LSHR * T5.Y, T5.X, 
literal.x, +; CM-NEXT: LSHR * T5.Y, T0.Y, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: AND_INT * T5.X, T5.X, literal.x, +; CM-NEXT: AND_INT * T5.X, T0.Y, literal.x, ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) @@ -1752,8 +1760,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1 +; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: @@ -1761,24 +1769,28 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x, +; EG-NEXT: MOV T2.X, T5.X, +; EG-NEXT: MOV * T3.X, T5.Y, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: MOV * T0.Z, PS, +; EG-NEXT: BFE_INT * T5.Z, PV.Z, 0.0, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T5.Y, literal.x, +; EG-NEXT: BFE_INT T5.X, T0.Y, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T0.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T5.X, literal.x, +; EG-NEXT: BFE_INT T5.W, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T0.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x, -; EG-NEXT: BFE_INT * T6.Y, PS, 0.0, literal.y, +; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x, +; EG-NEXT: BFE_INT * T5.Y, PS, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; ; CM-LABEL: global_sextload_v4i16_to_v4i32: ; CM: ; %bb.0: ; 
CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 -; CM-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T5.X +; CM-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: Fetch clause starting at 6: @@ -1786,16 +1798,20 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; CM-NEXT: ALU clause starting at 8: ; CM-NEXT: MOV * T5.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x, +; CM-NEXT: MOV * T2.X, T5.X, +; CM-NEXT: MOV T3.X, T5.Y, +; CM-NEXT: MOV * T0.Y, PV.X, +; CM-NEXT: MOV * T0.Z, PV.X, +; CM-NEXT: BFE_INT * T5.Z, PV.Z, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x, -; CM-NEXT: LSHR * T0.W, T5.Y, literal.x, +; CM-NEXT: BFE_INT T5.X, T0.Y, 0.0, literal.x, +; CM-NEXT: LSHR * T0.W, T0.Z, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T0.Z, T5.X, literal.x, -; CM-NEXT: BFE_INT * T6.W, PV.W, 0.0, literal.x, +; CM-NEXT: LSHR T0.Z, T0.Y, literal.x, +; CM-NEXT: BFE_INT * T5.W, PV.W, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T5.X, KC0[2].Y, literal.x, -; CM-NEXT: BFE_INT * T6.Y, PV.Z, 0.0, literal.y, +; CM-NEXT: LSHR T6.X, KC0[2].Y, literal.x, +; CM-NEXT: BFE_INT * T5.Y, PV.Z, 0.0, literal.y, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) %load = load <4 x i16>, ptr addrspace(1) %in %ext = sext <4 x i16> %load to <4 x i32> @@ -5772,25 +5788,29 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1 +; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW 
T5.XYZW, T8.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: ; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHR * T6.Z, T5.Y, literal.x, +; EG-NEXT: MOV T2.X, T5.X, +; EG-NEXT: MOV * T3.X, T5.Y, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: MOV * T0.Z, PS, +; EG-NEXT: LSHR * T5.Z, PV.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T6.X, T5.Y, literal.x, -; EG-NEXT: MOV T6.Y, 0.0, -; EG-NEXT: LSHR T5.Z, T5.X, literal.y, -; EG-NEXT: AND_INT * T5.X, T5.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: AND_INT T5.X, T0.Z, literal.x, ; EG-NEXT: MOV T5.Y, 0.0, -; EG-NEXT: MOV T6.W, 0.0, -; EG-NEXT: MOV * T5.W, 0.0, +; EG-NEXT: LSHR T6.Z, T0.Y, literal.y, +; EG-NEXT: AND_INT * T6.X, T0.Y, literal.x, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: MOV T6.Y, 0.0, +; EG-NEXT: MOV T5.W, 0.0, +; EG-NEXT: MOV * T6.W, 0.0, ; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -5801,26 +5821,30 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; CM: ; %bb.0: ; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 -; CM-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T8.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T7.X +; CM-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T8.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T7.X ; CM-NEXT: CF_END ; CM-NEXT: Fetch clause starting at 6: ; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 ; CM-NEXT: ALU clause starting at 8: ; CM-NEXT: MOV * T5.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: LSHR * T6.Z, T5.X, literal.x, +; CM-NEXT: MOV * T2.X, T5.X, +; CM-NEXT: MOV * T3.X, T5.Y, +; 
CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: MOV * T0.Z, T2.X, +; CM-NEXT: LSHR * T5.Z, PV.Z, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: AND_INT T6.X, T5.X, literal.x, -; CM-NEXT: MOV T6.Y, 0.0, -; CM-NEXT: LSHR * T5.Z, T5.Y, literal.y, -; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T5.X, T5.Y, literal.x, +; CM-NEXT: AND_INT T5.X, T0.Z, literal.x, ; CM-NEXT: MOV T5.Y, 0.0, -; CM-NEXT: MOV * T6.W, 0.0, -; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; CM-NEXT: LSHR * T6.Z, T0.Y, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: AND_INT T6.X, T0.Y, literal.x, +; CM-NEXT: MOV T6.Y, 0.0, ; CM-NEXT: MOV * T5.W, 0.0, +; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; CM-NEXT: MOV * T6.W, 0.0, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; CM-NEXT: LSHR * T7.X, PV.W, literal.x, @@ -5921,7 +5945,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1 ; EG-NEXT: CF_END @@ -5930,17 +5954,21 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: ASHR * T5.W, T5.X, literal.x, +; EG-NEXT: MOV T2.X, T5.X, +; EG-NEXT: MOV * T3.X, T5.Y, +; EG-NEXT: MOV T0.Y, PS, +; EG-NEXT: MOV * T0.Z, PV.X, +; EG-NEXT: ASHR * T5.W, PV.Z, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x, -; EG-NEXT: ASHR T5.Z, T5.X, literal.y, -; EG-NEXT: ASHR * T7.W, T5.Y, literal.z, +; EG-NEXT: ASHR T5.Z, T0.Z, literal.y, +; EG-NEXT: ASHR * T7.W, T0.Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 
16(2.242078e-44) ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x, -; EG-NEXT: ASHR * T7.Z, T5.Y, literal.x, +; EG-NEXT: BFE_INT T5.X, T0.Z, 0.0, literal.x, +; EG-NEXT: ASHR * T7.Z, T0.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T7.X, T0.Y, 0.0, literal.x, ; EG-NEXT: ASHR T5.Y, PV.X, literal.y, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) @@ -5952,31 +5980,35 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; CM: ; %bb.0: ; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 -; CM-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T8.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T7.X +; CM-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X ; CM-NEXT: CF_END ; CM-NEXT: Fetch clause starting at 6: ; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 ; CM-NEXT: ALU clause starting at 8: ; CM-NEXT: MOV * T5.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x, -; CM-NEXT: ASHR * T6.W, T5.Y, literal.y, +; CM-NEXT: MOV * T2.X, T5.X, +; CM-NEXT: MOV T3.X, T5.Y, +; CM-NEXT: MOV * T0.Y, PV.X, +; CM-NEXT: MOV * T0.Z, PV.X, +; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, +; CM-NEXT: ASHR * T5.W, PV.Z, literal.y, ; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44) -; CM-NEXT: LSHR T7.X, PV.Z, literal.x, -; CM-NEXT: ASHR T6.Z, T5.Y, literal.y, -; CM-NEXT: ASHR * T5.W, T5.X, literal.z, +; CM-NEXT: LSHR T6.X, PV.Z, literal.x, +; CM-NEXT: ASHR T5.Z, T0.Z, literal.y, +; CM-NEXT: ASHR * T7.W, T0.Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T6.X, T5.Y, 0.0, literal.x, -; CM-NEXT: ASHR * T5.Z, T5.X, literal.x, +; CM-NEXT: BFE_INT T5.X, T0.Z, 
0.0, literal.x, +; CM-NEXT: ASHR * T7.Z, T0.Y, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x, -; CM-NEXT: ASHR * T6.Y, PV.X, literal.y, +; CM-NEXT: BFE_INT T7.X, T0.Y, 0.0, literal.x, +; CM-NEXT: ASHR * T5.Y, PV.X, literal.y, ; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x, -; CM-NEXT: ASHR * T5.Y, PV.X, literal.y, +; CM-NEXT: ASHR * T7.Y, PV.X, literal.y, ; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44) %load = load <4 x i16>, ptr addrspace(1) %in %ext = sext <4 x i16> %load to <4 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 20c9544f73bd2..8f99ab780ca93 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -681,43 +681,51 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; EG-LABEL: shl_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 42, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 51, @11, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: ; EG-NEXT: VTX_READ_128 T10.XYZW, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV T0.Y, T6.X, -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: AND_INT * T1.W, T10.Z, literal.x, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: MOV T4.X, T10.X, +; EG-NEXT: MOV * T5.X, T10.Y, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: MOV T0.Y, PS, +; EG-NEXT: MOV * T2.X, T10.Z, +; EG-NEXT: MOV T3.X, T10.W, +; EG-NEXT: MOV * T0.Z, T6.X, +; EG-NEXT: MOV * T1.Y, T2.X, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, ; EG-NEXT: 65535(9.183409e-41), 
0(0.000000e+00) -; EG-NEXT: LSHL * T1.W, T10.X, PV.W, +; EG-NEXT: LSHL * T1.W, T0.X, PV.W, ; EG-NEXT: AND_INT T1.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T2.W, T0.Y, literal.y, +; EG-NEXT: AND_INT * T2.W, T0.Z, literal.y, ; EG-NEXT: 65535(9.183409e-41), -65536(nan) ; EG-NEXT: OR_INT * T1.W, PS, PV.W, -; EG-NEXT: MOV * T6.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: LSHR T1.W, T10.Z, literal.x, -; EG-NEXT: LSHR * T2.W, T10.X, literal.x, +; EG-NEXT: MOV * T0.Z, T3.X, +; EG-NEXT: MOV * T6.X, T1.W, +; EG-NEXT: MOV T1.Z, PV.X, +; EG-NEXT: LSHR T1.W, T1.Y, literal.x, +; EG-NEXT: LSHR * T2.W, T0.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LSHL T1.W, PS, PV.W, -; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, +; EG-NEXT: AND_INT * T2.W, PV.Z, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LSHL * T1.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, ; EG-NEXT: MOV T6.X, PV.W, ; EG-NEXT: MOV * T0.X, T7.X, -; EG-NEXT: AND_INT * T1.W, T10.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL T1.W, T10.Y, PV.W, +; EG-NEXT: LSHL T1.W, T0.Y, PV.W, ; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: AND_INT * T1.W, PV.W, literal.x, @@ -725,8 +733,8 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, ; EG-NEXT: MOV * T7.X, PV.W, ; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: LSHR T1.W, T10.W, literal.x, -; EG-NEXT: LSHR * T2.W, T10.Y, literal.x, +; EG-NEXT: LSHR T1.W, T0.Z, literal.x, +; EG-NEXT: LSHR * T2.W, T0.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LSHL * T1.W, PS, PV.W, ; EG-NEXT: AND_INT T0.Z, T0.X, literal.x, diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index 7097f58004855..cac917902922e 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ 
b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -323,43 +323,52 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; EG-LABEL: ashr_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 48, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 58, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: ; EG-NEXT: VTX_READ_128 T9.XYZW, T9.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.Y, T6.X, ; EG-NEXT: MOV * T9.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: BFE_INT T0.W, T9.X, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, T9.Z, literal.y, -; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; EG-NEXT: ASHR * T0.W, PV.W, PS, -; EG-NEXT: AND_INT T0.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: MOV T4.X, T9.X, +; EG-NEXT: MOV * T5.X, T9.Y, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: MOV * T0.Z, PS, +; EG-NEXT: MOV T2.X, T9.Z, +; EG-NEXT: MOV * T3.X, T9.W, +; EG-NEXT: MOV * T0.W, T6.X, +; EG-NEXT: MOV T1.Y, T2.X, +; EG-NEXT: BFE_INT * T1.W, T0.Y, 0.0, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: ASHR * T1.W, T1.W, PV.W, +; EG-NEXT: AND_INT T1.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T0.W, T0.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), -65536(nan) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T6.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T9.X, literal.x, +; EG-NEXT: MOV * T1.Z, T3.X, +; EG-NEXT: MOV * T6.X, T0.W, +; EG-NEXT: MOV T0.W, PV.X, +; EG-NEXT: LSHR * T1.W, T0.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T1.W, T9.Z, literal.x, +; EG-NEXT: 
BFE_INT T1.W, PS, 0.0, literal.x, +; EG-NEXT: LSHR * T2.W, T1.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: ASHR T0.W, PV.W, PS, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, +; EG-NEXT: ASHR T1.W, PV.W, PS, +; EG-NEXT: AND_INT * T0.W, T0.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: LSHL * T1.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: OR_INT * T0.W, T0.W, PV.W, ; EG-NEXT: MOV T6.X, PV.W, ; EG-NEXT: MOV T0.Y, T7.X, -; EG-NEXT: BFE_INT T0.W, T9.Y, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, T9.W, literal.y, +; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, T1.Z, literal.y, ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) ; EG-NEXT: ASHR T0.W, PV.W, PS, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, @@ -369,10 +378,10 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, ; EG-NEXT: MOV * T7.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T9.Y, literal.x, +; EG-NEXT: LSHR * T0.W, T0.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T1.W, T9.W, literal.x, +; EG-NEXT: LSHR * T1.W, T1.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ASHR T0.W, PV.W, PS, ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, diff --git a/llvm/test/CodeGen/Mips/o32_cc_byval.ll b/llvm/test/CodeGen/Mips/o32_cc_byval.ll index de6b4dd2ab71d..94c327b5a4665 100644 --- a/llvm/test/CodeGen/Mips/o32_cc_byval.ll +++ b/llvm/test/CodeGen/Mips/o32_cc_byval.ll @@ -69,8 +69,8 @@ define void @f1() nounwind { ; CHECK-NEXT: sw $1, 16($sp) ; CHECK-NEXT: lw $7, 4($18) ; CHECK-NEXT: lw $6, %lo(f1.s1)($17) +; CHECK-NEXT: lbu $5, 40($sp) ; CHECK-NEXT: lw $25, %call16(callee3)($16) -; CHECK-NEXT: addiu $5, $zero, 11 ; CHECK-NEXT: jalr $25 ; CHECK-NEXT: move $gp, 
$16 ; CHECK-NEXT: lw $16, 48($sp) # 4-byte Folded Reload @@ -234,7 +234,6 @@ define void @f5(i64 %a0, ptr nocapture byval(%struct.S4) %a1) nounwind { ; CHECK-NEXT: addiu $sp, $sp, -32 ; CHECK-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill ; CHECK-NEXT: addu $gp, $2, $25 -; CHECK-NEXT: move $1, $6 ; CHECK-NEXT: sw $7, 44($sp) ; CHECK-NEXT: sw $6, 40($sp) ; CHECK-NEXT: sw $5, 20($sp) @@ -244,7 +243,7 @@ define void @f5(i64 %a0, ptr nocapture byval(%struct.S4) %a1) nounwind { ; CHECK-NEXT: lw $5, 44($sp) ; CHECK-NEXT: lw $25, %call16(f6)($gp) ; CHECK-NEXT: jalr $25 -; CHECK-NEXT: move $4, $1 +; CHECK-NEXT: lw $4, 40($sp) ; CHECK-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload ; CHECK-NEXT: jr $ra ; CHECK-NEXT: addiu $sp, $sp, 32 diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll index 5e7a1bc81916e..508dd633d3750 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll @@ -958,33 +958,33 @@ declare i32 @test_byval_homogeneous_float_struct(ptr byval(%struct.F) align 4) ; CHECK-LABEL: name: call_test_byval_homogeneous_float_struct{{.*}} ; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 +; 32BIT-DAG: renamable $r3 = LWZ 0, %stack.0.s :: (load (s32) from %stack.0.s, align 8) ; 32BIT-DAG: renamable $r4 = LWZ 4, %stack.0.s :: (load (s32) from %stack.0.s + 4) ; 32BIT-DAG: renamable $r5 = LWZ 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8) -; 32BIT-DAG: $r3 = LI 0 ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r2, implicit-def $r1, implicit-def $r3 ; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 ; CHECKASM-LABEL: .call_test_byval_homogeneous_float_struct: ; ASM32: stwu 1, -80(1) +; ASM32-DAG: lwz 3, 64(1) ; ASM32-DAG: lwz 4, 68(1) ; ASM32-DAG: lwz 5, 72(1) -; ASM32-DAG: stw 3, 64(1) ; ASM32-NEXT: bl .test_byval_homogeneous_float_struct[PR] ; ASM32-NEXT: nop ; 
The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. ; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: renamable $x3 = LWZ8 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8) -; 64BIT-NEXT: renamable $x4 = RLDICR killed renamable $x3, 32, 31 -; 64BIT-NEXT: $x3 = LI8 0 +; 64BIT-DAG: renamable $x3 = LD 0, %stack.0.s :: (load (s64) from %stack.0.s) +; 64BIT-DAG: renamable $x4 = LWZ8 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8) +; 64BIT-DAG: renamable $x4 = RLDICR killed renamable $x4, 32, 31 ; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1, implicit-def $x3 ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. ; ASM64: stdu 1, -128(1) -; ASM64: lwz 3, 120(1) -; ASM64-NEXT: sldi 4, 3, 32 -; ASM64-NEXT: li 3, 0 +; ASM64-DAG: ld 3, 112(1) +; ASM64-DAG: lwz 4, 120(1) +; ASM64-DAG: sldi 4, 4, 32 ; ASM64-NEXT: bl .test_byval_homogeneous_float_struct[PR] ; ASM64-NEXT: nop diff --git a/llvm/test/CodeGen/PowerPC/byval-lhs.ll b/llvm/test/CodeGen/PowerPC/byval-lhs.ll index aef374a5edbb7..80de18133dd4d 100644 --- a/llvm/test/CodeGen/PowerPC/byval-lhs.ll +++ b/llvm/test/CodeGen/PowerPC/byval-lhs.ll @@ -17,6 +17,7 @@ define void @bar1(i64 %a) nounwind { ; LE-NEXT: stdu r1, -48(r1) ; LE-NEXT: std r0, 64(r1) ; LE-NEXT: std r3, 40(r1) +; LE-NEXT: ld r3, 40(r1) ; LE-NEXT: bl f0 ; LE-NEXT: nop ; LE-NEXT: addi r1, r1, 48 @@ -30,6 +31,7 @@ define void @bar1(i64 %a) nounwind { ; AIX-NEXT: stdu r1, -128(r1) ; AIX-NEXT: std r0, 144(r1) ; AIX-NEXT: std r3, 120(r1) +; AIX-NEXT: ld r3, 120(r1) ; AIX-NEXT: bl .f0[PR] ; AIX-NEXT: nop ; AIX-NEXT: addi r1, r1, 128 @@ -47,10 +49,11 @@ define void @bar2(i64 %a) nounwind { ; LE: # %bb.0: ; LE-NEXT: mflr r0 ; LE-NEXT: stdu r1, -48(r1) -; LE-NEXT: mr r4, r3 ; LE-NEXT: std 
r0, 64(r1) ; LE-NEXT: std r3, 32(r1) ; LE-NEXT: std r3, 40(r1) +; LE-NEXT: ld r4, 40(r1) +; LE-NEXT: ld r3, 32(r1) ; LE-NEXT: bl f1 ; LE-NEXT: nop ; LE-NEXT: addi r1, r1, 48 @@ -62,10 +65,11 @@ define void @bar2(i64 %a) nounwind { ; AIX: # %bb.0: ; AIX-NEXT: mflr r0 ; AIX-NEXT: stdu r1, -128(r1) -; AIX-NEXT: mr r4, r3 ; AIX-NEXT: std r0, 144(r1) ; AIX-NEXT: std r3, 112(r1) ; AIX-NEXT: std r3, 120(r1) +; AIX-NEXT: ld r4, 120(r1) +; AIX-NEXT: ld r3, 112(r1) ; AIX-NEXT: bl .f1[PR] ; AIX-NEXT: nop ; AIX-NEXT: addi r1, r1, 128 diff --git a/llvm/test/CodeGen/PowerPC/ppc64-byval-larger-struct.ll b/llvm/test/CodeGen/PowerPC/ppc64-byval-larger-struct.ll index 39b06619063df..429b8774f6ec3 100644 --- a/llvm/test/CodeGen/PowerPC/ppc64-byval-larger-struct.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-byval-larger-struct.ll @@ -184,6 +184,7 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat ; P8LE-NEXT: stdx r3, 0, r5 ; P8LE-NEXT: stb r4, 79(r1) ; P8LE-NEXT: lbz r4, 56(r1) +; P8LE-NEXT: ld r3, 48(r1) ; P8LE-NEXT: bl callee_9 ; P8LE-NEXT: nop ; P8LE-NEXT: li r3, 0 @@ -203,6 +204,7 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat ; P9LE-NEXT: std r3, 48(r1) ; P9LE-NEXT: stdx r3, 0, r4 ; P9LE-NEXT: lbz r4, 56(r1) +; P9LE-NEXT: ld r3, 48(r1) ; P9LE-NEXT: stb r5, 79(r1) ; P9LE-NEXT: bl callee_9 ; P9LE-NEXT: nop @@ -223,6 +225,7 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat ; P10LE-NEXT: lbz r5, 56(r1) ; P10LE-NEXT: stdx r3, 0, r4 ; P10LE-NEXT: lbz r4, 56(r1) +; P10LE-NEXT: ld r3, 48(r1) ; P10LE-NEXT: stb r5, 79(r1) ; P10LE-NEXT: bl callee_9@notoc ; P10LE-NEXT: li r3, 0 @@ -243,6 +246,7 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat ; P8BE-NEXT: stdx r3, 0, r5 ; P8BE-NEXT: stb r4, 143(r1) ; P8BE-NEXT: lbz r4, 200(r1) +; P8BE-NEXT: ld r3, 192(r1) ; P8BE-NEXT: bl callee_9 ; P8BE-NEXT: nop ; P8BE-NEXT: li r3, 0 @@ -262,6 +266,7 @@ define signext i8 
@caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat ; P9BE-NEXT: std r3, 192(r1) ; P9BE-NEXT: stdx r3, 0, r4 ; P9BE-NEXT: lbz r4, 200(r1) +; P9BE-NEXT: ld r3, 192(r1) ; P9BE-NEXT: stb r5, 143(r1) ; P9BE-NEXT: bl callee_9 ; P9BE-NEXT: nop @@ -282,6 +287,7 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat ; P10BE-NEXT: lbz r5, 200(r1) ; P10BE-NEXT: stdx r3, 0, r4 ; P10BE-NEXT: lbz r4, 200(r1) +; P10BE-NEXT: ld r3, 192(r1) ; P10BE-NEXT: stb r5, 143(r1) ; P10BE-NEXT: bl callee_9 ; P10BE-NEXT: nop diff --git a/llvm/test/CodeGen/X86/fastcc-byval.ll b/llvm/test/CodeGen/X86/fastcc-byval.ll index 920291a73ecd6..aee07caf4efb5 100644 --- a/llvm/test/CodeGen/X86/fastcc-byval.ll +++ b/llvm/test/CodeGen/X86/fastcc-byval.ll @@ -16,7 +16,8 @@ define fastcc i32 @bar() nounwind { ; CHECK: ## %bb.0: ; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: movl $1, 8(%esp) -; CHECK-NEXT: movl $1, (%esp) +; CHECK-NEXT: movl 8(%esp), %eax +; CHECK-NEXT: movl %eax, (%esp) ; CHECK-NEXT: calll _foo ; CHECK-NEXT: movl 8(%esp), %eax ; CHECK-NEXT: addl $12, %esp