diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
index cbc248fbd9c88..25c82ed61fc2e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
@@ -81,6 +81,32 @@ static void buildCluster(ArrayRef<SUnit *> Exports, ScheduleDAGInstrs *DAG) {
   }
 }
 
+static void removeExportDependencies(ScheduleDAGInstrs *DAG, SUnit &SU) {
+  SmallVector<SDep, 2> ToAdd, ToRemove;
+
+  for (const SDep &Pred : SU.Preds) {
+    SUnit *PredSU = Pred.getSUnit();
+    if (Pred.isBarrier() && isExport(*PredSU)) {
+      ToRemove.push_back(Pred);
+      if (isExport(SU))
+        continue;
+
+      // If we remove a barrier we need to copy dependencies
+      // from the predecessor to maintain order.
+      for (const SDep &ExportPred : PredSU->Preds) {
+        SUnit *ExportPredSU = ExportPred.getSUnit();
+        if (ExportPred.isBarrier() && !isExport(*ExportPredSU))
+          ToAdd.push_back(SDep(ExportPredSU, SDep::Barrier));
+      }
+    }
+  }
+
+  for (SDep Pred : ToRemove)
+    SU.removePred(Pred);
+  for (SDep Pred : ToAdd)
+    DAG->addEdge(&SU, Pred);
+}
+
 void ExportClustering::apply(ScheduleDAGInstrs *DAG) {
   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII);
 
@@ -92,20 +118,18 @@ void ExportClustering::apply(ScheduleDAGInstrs *DAG) {
   // on exports. Edges will be added later to order the exports.
   unsigned PosCount = 0;
   for (SUnit &SU : DAG->SUnits) {
-    if (isExport(SU)) {
-      Chain.push_back(&SU);
-      if (isPositionExport(TII, &SU))
-        PosCount++;
-    }
+    if (!isExport(SU))
+      continue;
 
-    SmallVector<SDep, 2> ToRemove;
-    for (const SDep &Pred : SU.Preds) {
-      SUnit *PredSU = Pred.getSUnit();
-      if (Pred.isBarrier() && isExport(*PredSU))
-        ToRemove.push_back(Pred);
-    }
-    for (SDep Pred : ToRemove)
-      SU.removePred(Pred);
+    Chain.push_back(&SU);
+    if (isPositionExport(TII, &SU))
+      PosCount++;
+
+    removeExportDependencies(DAG, SU);
+
+    SmallVector<SDep, 2> Succs(SU.Succs);
+    for (SDep Succ : Succs)
+      removeExportDependencies(DAG, *Succ.getSUnit());
   }
 
   // Apply clustering if there are multiple exports
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
index a9e1f1859a2e5..9a62ca5db0891 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -602,6 +602,27 @@ define amdgpu_kernel void @test_export_pos_before_param_across_load(i32 %idx) #0 {
   ret void
 }
 
+; GCN-LABEL: {{^}}test_export_across_store_load:
+; GCN: buffer_store
+; GCN: buffer_load
+; GCN: exp pos0
+; GCN: exp param0
+; GCN: exp param1
+define amdgpu_kernel void @test_export_across_store_load(i32 %idx, float %v) #0 {
+  %data0 = alloca <4 x float>, align 8, addrspace(5)
+  %data1 = alloca <4 x float>, align 8, addrspace(5)
+  %cmp = icmp eq i32 %idx, 1
+  %data = select i1 %cmp, <4 x float> addrspace(5)* %data0, <4 x float> addrspace(5)* %data1
+  %sptr = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %data, i32 0, i32 0
+  store float %v, float addrspace(5)* %sptr, align 8
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 0.0, float 0.0, float 0.0, float 1.0, i1 true, i1 false)
+  %ptr0 = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %data0, i32 0, i32 0
+  %load0 = load float, float addrspace(5)* %ptr0, align 8
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %load0, float 0.0, float 1.0, float 0.0, i1 false, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float %load0, float 0.0, float 1.0, float 0.0, i1 false, i1 false)
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind inaccessiblememonly }
 attributes #2 = { nounwind readnone }
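
Note on the fix (not part of the patch): the invariant removeExportDependencies() maintains is that deleting an export's barrier edge must not lose transitive ordering through that export. In the new test, the chain is store -> exp -> load; dropping only the exp -> load edge would let the load float above the store. A toy, self-contained sketch of the copy-then-remove idea on a plain edge set, assuming the Edge type and the Store/Export/Load names are illustrative, not LLVM's types:

```cpp
#include <cassert>
#include <set>
#include <utility>
#include <vector>

using Edge = std::pair<int, int>; // (pred, succ): pred must run before succ

int main() {
  enum { Store = 0, Export = 1, Load = 2 };
  std::set<Edge> order = {{Store, Export}, {Export, Load}};

  // Mirror the patch: queue edges to add/remove first, then apply, so the
  // Store -> Load constraint survives removal of the export's edge.
  std::vector<Edge> toAdd, toRemove;
  for (const Edge &e : order) {
    if (e.first == Export) {        // barrier edge leaving an export
      toRemove.push_back(e);
      for (const Edge &p : order)   // copy the export's predecessor
        if (p.second == Export)     // constraints onto the successor
          toAdd.push_back({p.first, e.second});
    }
  }
  for (const Edge &e : toRemove)
    order.erase(e);
  for (const Edge &e : toAdd)
    order.insert(e);

  assert(order.count({Store, Load}) && "transitive ordering preserved");
}
```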
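For context on where apply() runs: ExportClustering is a ScheduleDAGMutation, invoked after the scheduler builds each region's DAG and before scheduling. A minimal sketch of how such a mutation is hooked up, using the real factory createAMDGPUExportClusteringDAGMutation() from AMDGPUExportClustering.h; the wrapper function name here is invented for illustration and is not the target's actual scheduler-construction code:

```cpp
#include "AMDGPUExportClustering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include <memory>

using namespace llvm;

static ScheduleDAGInstrs *
createSchedulerWithExportClustering(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C));
  // Mutations registered here run on the built DAG for every scheduling
  // region; this is the point where ExportClustering::apply() is called.
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}
```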