diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index a297db17883583..7bded0005a517d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -198,10 +198,10 @@ static void RemoveUnusedGlue(SDNode *N, SelectionDAG *DAG) {
 /// outputs to ensure they are scheduled together and in order. This
 /// optimization may benefit some targets by improving cache locality.
 void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
-  SDNode *Chain = nullptr;
+  SDValue Chain;
   unsigned NumOps = Node->getNumOperands();
   if (Node->getOperand(NumOps-1).getValueType() == MVT::Other)
-    Chain = Node->getOperand(NumOps-1).getNode();
+    Chain = Node->getOperand(NumOps-1);
   if (!Chain)
     return;
 
@@ -234,6 +234,9 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
   unsigned UseCount = 0;
   for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end();
        I != E && UseCount < 100; ++I, ++UseCount) {
+    if (I.getUse().getResNo() != Chain.getResNo())
+      continue;
+
     SDNode *User = *I;
     if (User == Node || !Visited.insert(User).second)
       continue;
diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll
new file mode 100644
index 00000000000000..50ba7e19f46e07
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+; This used to cause a circular chain dependency during
+; SelectionDAG instruction scheduling.
+
+; CHECK-LABEL: {{^}}_amdgpu_gs_main:
+; CHECK: ds_read_b32
+; CHECK: ds_read_b32
+; CHECK: ds_read_b32
+; CHECK: ds_read_b32
+define amdgpu_gs float @_amdgpu_gs_main(i8 addrspace(3)* %arg0, i8 addrspace(3)* %arg1, i8 addrspace(3)* %arg2) #0 {
+  %tmp0 = bitcast i8 addrspace(3)* %arg0 to i32 addrspace(3)* addrspace(3)*
+  %tmp = load volatile i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %tmp0, align 4
+
+  %tmp3 = load volatile i32, i32 addrspace(3)* %tmp, align 4
+
+  %tmp4a = bitcast i8 addrspace(3)* %arg1 to i32 addrspace(3)*
+  %tmp4 = load volatile i32, i32 addrspace(3)* %tmp4a, align 4
+
+  %tmp7a = getelementptr i32, i32 addrspace(3)* %tmp, i32 8
+  %tmp8 = load volatile i32, i32 addrspace(3)* %tmp7a, align 4
+
+  %tmp9 = add i32 %tmp3, %tmp8
+  %tmp10 = add i32 %tmp9, %tmp4
+  %tmp14 = bitcast i32 %tmp10 to float
+  ret float %tmp14
+}