diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index a297db17883583..7bded0005a517d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -198,10 +198,10 @@ static void RemoveUnusedGlue(SDNode *N, SelectionDAG *DAG) {
 /// outputs to ensure they are scheduled together and in order. This
 /// optimization may benefit some targets by improving cache locality.
 void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
-  SDNode *Chain = nullptr;
+  SDValue Chain;
   unsigned NumOps = Node->getNumOperands();
   if (Node->getOperand(NumOps-1).getValueType() == MVT::Other)
-    Chain = Node->getOperand(NumOps-1).getNode();
+    Chain = Node->getOperand(NumOps-1);
   if (!Chain)
     return;
 
@@ -234,6 +234,9 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
   unsigned UseCount = 0;
   for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end();
        I != E && UseCount < 100; ++I, ++UseCount) {
+    if (I.getUse().getResNo() != Chain.getResNo())
+      continue;
+
     SDNode *User = *I;
     if (User == Node || !Visited.insert(User).second)
       continue;
diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll
new file mode 100644
index 00000000000000..50ba7e19f46e07
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+; This used to cause a circular chain dependency during
+; SelectionDAG instruction scheduling.
+
+; CHECK-LABEL: {{^}}_amdgpu_gs_main:
+; CHECK: ds_read_b32
+; CHECK: ds_read_b32
+; CHECK: ds_read_b32
+; CHECK: ds_read_b32
+define amdgpu_gs float @_amdgpu_gs_main(i8 addrspace(3)* %arg0, i8 addrspace(3)* %arg1, i8 addrspace(3)* %arg2) #0 {
+  %tmp0 = bitcast i8 addrspace(3)* %arg0 to i32 addrspace(3)* addrspace(3)*
+  %tmp = load volatile i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %tmp0, align 4
+
+  %tmp3 = load volatile i32, i32 addrspace(3)* %tmp, align 4
+
+  %tmp4a = bitcast i8 addrspace(3)* %arg1 to i32 addrspace(3)*
+  %tmp4 = load volatile i32, i32 addrspace(3)* %tmp4a, align 4
+
+  %tmp7a = getelementptr i32, i32 addrspace(3)* %tmp, i32 8
+  %tmp8 = load volatile i32, i32 addrspace(3)* %tmp7a, align 4
+
+  %tmp9 = add i32 %tmp3, %tmp8
+  %tmp10 = add i32 %tmp9, %tmp4
+  %tmp14 = bitcast i32 %tmp10 to float
+  ret float %tmp14
+}