From e2cc091a7d01fc3a3d226e206f00f029c9e3e079 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nadav256@gmail.com>
Date: Mon, 10 Jan 2022 12:51:37 -0800
Subject: [PATCH] Fix a missed opportunity to merge stores.

This commit fixes a missed opportunity in merging consecutive stores.
The code that searches for stores skipped the case of stores that
directly connect to the root. The comment above the implementation lists
this case but the code did not handle it. I found this pattern when
looking into the shared_ptr destructor. GCC generates the right
sequence. Here is a small repo:

    int foo(int* buff) {
        buff[0] = 0;
        int x = buff[1];
        buff[1] = 0;
        return x;
    }

Differential Revision: https://reviews.llvm.org/D116895
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  4 ++
 .../Hexagon/store-widen-aliased-load.ll       |  2 +-
 llvm/test/CodeGen/PowerPC/mma-acc-memops.ll   | 64 ++++++++-----------
 .../CodeGen/X86/MergeConsecutiveStores.ll     | 28 ++++++++
 4 files changed, 60 insertions(+), 38 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9ac9376646423..7e474db125e40 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17492,6 +17492,10 @@ void DAGCombiner::getStoreMergeCandidates(
         for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
           TryToAddCandidate(I2);
       }
+      // Check stores that depend on the root (e.g. Store 3 in the chart above).
+      if (I.getOperandNo() == 0 && isa<StoreSDNode>(*I)) {
+        TryToAddCandidate(I);
+      }
     }
   } else {
     for (auto I = RootNode->use_begin(), E = RootNode->use_end();
diff --git a/llvm/test/CodeGen/Hexagon/store-widen-aliased-load.ll b/llvm/test/CodeGen/Hexagon/store-widen-aliased-load.ll
index a8380306565ef..d0bd6850bb825 100644
--- a/llvm/test/CodeGen/Hexagon/store-widen-aliased-load.ll
+++ b/llvm/test/CodeGen/Hexagon/store-widen-aliased-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon --combiner-store-merging=false < %s | FileCheck %s
 ; CHECK-NOT: memh
 ; Check that store widening does not merge the two stores.
 
diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
index 5c8c84be07021..5915862f4cbdb 100644
--- a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
@@ -562,69 +562,59 @@ define dso_local void @testUnalignedLdStPair() {
 ; LE-PWR9-LABEL: testUnalignedLdStPair:
 ; LE-PWR9:       # %bb.0: # %entry
 ; LE-PWR9-NEXT:    addis r3, r2, g@toc@ha
-; LE-PWR9-NEXT:    li r6, 19
 ; LE-PWR9-NEXT:    li r4, 11
-; LE-PWR9-NEXT:    li r5, 35
-; LE-PWR9-NEXT:    li r7, 27
 ; LE-PWR9-NEXT:    addi r3, r3, g@toc@l
-; LE-PWR9-NEXT:    lxvx vs0, r3, r6
-; LE-PWR9-NEXT:    ldx r4, r3, r4
-; LE-PWR9-NEXT:    ldx r5, r3, r5
-; LE-PWR9-NEXT:    stdx r4, r3, r6
-; LE-PWR9-NEXT:    stxvx vs0, r3, r7
-; LE-PWR9-NEXT:    li r7, 43
-; LE-PWR9-NEXT:    stdx r5, r3, r7
+; LE-PWR9-NEXT:    lxvx vs0, r3, r4
+; LE-PWR9-NEXT:    li r4, 27
+; LE-PWR9-NEXT:    lxvx vs1, r3, r4
+; LE-PWR9-NEXT:    li r4, 35
+; LE-PWR9-NEXT:    stxvx vs1, r3, r4
+; LE-PWR9-NEXT:    li r4, 19
+; LE-PWR9-NEXT:    stxvx vs0, r3, r4
 ; LE-PWR9-NEXT:    blr
 ;
 ; LE-PWR8-LABEL: testUnalignedLdStPair:
 ; LE-PWR8:       # %bb.0: # %entry
 ; LE-PWR8-NEXT:    addis r3, r2, g@toc@ha
-; LE-PWR8-NEXT:    li r4, 19
+; LE-PWR8-NEXT:    li r4, 27
 ; LE-PWR8-NEXT:    li r5, 11
-; LE-PWR8-NEXT:    li r6, 35
-; LE-PWR8-NEXT:    li r7, 43
-; LE-PWR8-NEXT:    li r8, 27
+; LE-PWR8-NEXT:    li r6, 19
+; LE-PWR8-NEXT:    li r8, 35
 ; LE-PWR8-NEXT:    addi r3, r3, g@toc@l
 ; LE-PWR8-NEXT:    lxvd2x vs0, r3, r4
 ; LE-PWR8-NEXT:    ldx r5, r3, r5
-; LE-PWR8-NEXT:    ldx r6, r3, r6
-; LE-PWR8-NEXT:    stdx r6, r3, r7
-; LE-PWR8-NEXT:    stdx r5, r3, r4
+; LE-PWR8-NEXT:    ldx r7, r3, r6
+; LE-PWR8-NEXT:    stdx r7, r3, r4
+; LE-PWR8-NEXT:    stdx r5, r3, r6
 ; LE-PWR8-NEXT:    stxvd2x vs0, r3, r8
 ; LE-PWR8-NEXT:    blr
 ;
 ; BE-PWR9-LABEL: testUnalignedLdStPair:
 ; BE-PWR9:       # %bb.0: # %entry
 ; BE-PWR9-NEXT:    addis r3, r2, g@toc@ha
-; BE-PWR9-NEXT:    li r6, 19
 ; BE-PWR9-NEXT:    li r4, 11
-; BE-PWR9-NEXT:    li r5, 35
-; BE-PWR9-NEXT:    li r7, 27
 ; BE-PWR9-NEXT:    addi r3, r3, g@toc@l
-; BE-PWR9-NEXT:    lxvx vs0, r3, r6
-; BE-PWR9-NEXT:    ldx r4, r3, r4
-; BE-PWR9-NEXT:    ldx r5, r3, r5
-; BE-PWR9-NEXT:    stdx r4, r3, r6
-; BE-PWR9-NEXT:    stxvx vs0, r3, r7
-; BE-PWR9-NEXT:    li r7, 43
-; BE-PWR9-NEXT:    stdx r5, r3, r7
+; BE-PWR9-NEXT:    lxvx vs0, r3, r4
+; BE-PWR9-NEXT:    li r4, 27
+; BE-PWR9-NEXT:    lxvx vs1, r3, r4
+; BE-PWR9-NEXT:    li r4, 35
+; BE-PWR9-NEXT:    stxvx vs1, r3, r4
+; BE-PWR9-NEXT:    li r4, 19
+; BE-PWR9-NEXT:    stxvx vs0, r3, r4
 ; BE-PWR9-NEXT:    blr
 ;
 ; BE-PWR8-LABEL: testUnalignedLdStPair:
 ; BE-PWR8:       # %bb.0: # %entry
 ; BE-PWR8-NEXT:    addis r3, r2, g@toc@ha
-; BE-PWR8-NEXT:    li r4, 19
-; BE-PWR8-NEXT:    li r5, 11
-; BE-PWR8-NEXT:    li r6, 35
-; BE-PWR8-NEXT:    li r7, 27
+; BE-PWR8-NEXT:    li r4, 11
+; BE-PWR8-NEXT:    li r5, 27
 ; BE-PWR8-NEXT:    addi r3, r3, g@toc@l
 ; BE-PWR8-NEXT:    lxvd2x vs0, r3, r4
-; BE-PWR8-NEXT:    ldx r5, r3, r5
-; BE-PWR8-NEXT:    ldx r6, r3, r6
-; BE-PWR8-NEXT:    stxvd2x vs0, r3, r7
-; BE-PWR8-NEXT:    li r7, 43
-; BE-PWR8-NEXT:    stdx r5, r3, r4
-; BE-PWR8-NEXT:    stdx r6, r3, r7
+; BE-PWR8-NEXT:    lxvd2x vs1, r3, r5
+; BE-PWR8-NEXT:    li r4, 35
+; BE-PWR8-NEXT:    li r5, 19
+; BE-PWR8-NEXT:    stxvd2x vs1, r3, r4
+; BE-PWR8-NEXT:    stxvd2x vs0, r3, r5
 ; BE-PWR8-NEXT:    blr
 entry:
   %0 = bitcast <256 x i1>* @g to i8*
diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
index de74c8055834a..106a4d75418e8 100644
--- a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -920,3 +920,31 @@ define void @merge_heterogeneous(%struct.C* nocapture %p, %struct.C* nocapture %
   ret void
 }
 
+define i32 @merge_store_load_store_seq(i32* %buff) {
+entry:
+; CHECK-LABEL: merge_store_load_store_seq:
+; CHECK:      movl 4(%rdi), %eax
+; CHECK-NEXT: movq $0, (%rdi)
+; CHECK-NEXT: retq
+
+  store i32 0, i32* %buff, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %buff, i64 1
+  %0 = load i32, i32* %arrayidx1, align 4
+  store i32 0, i32* %arrayidx1, align 4
+  ret i32 %0
+}
+
+define i32 @merge_store_alias(i32* %buff, i32* %other) {
+entry:
+; CHECK-LABEL: merge_store_alias:
+; CHECK:  movl $0, (%rdi)
+; CHECK-NEXT:  movl (%rsi), %eax
+; CHECK-NEXT:  movl $0, 4(%rdi)
+; CHECK-NEXT:  retq
+
+  store i32 0, i32* %buff, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %buff, i64 1
+  %0 = load i32, i32* %other, align 4
+  store i32 0, i32* %arrayidx1, align 4
+  ret i32 %0
+}