Fix a missed opportunity to merge stores.

This commit fixes a missed opportunity in merging consecutive stores. The code that searches for stores skipped the case of stores that directly connect to the root. The comment above the implementation lists this case but the code did not handle it. I found this pattern when looking into the shared_ptr destructor. GCC generates the right sequence. Here is a small repo: int foo(int* buff) { buff[0] = 0; int x = buff[1]; buff[1] = 0; return x; } Differential Revision: https://reviews.llvm.org/D116895
llvm · Jan 10, 2022 · e2cc091 · e2cc091
1 parent 8bed953
commit e2cc091
Show file tree

Hide file tree

Showing 4 changed files with 60 additions and 38 deletions.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17492,6 +17492,10 @@ void DAGCombiner::getStoreMergeCandidates(
         for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
           TryToAddCandidate(I2);
       }
+      // Check stores that depend on the root (e.g. Store 3 in the chart above).
+      if (I.getOperandNo() == 0 && isa<StoreSDNode>(*I)) {
+        TryToAddCandidate(I);
+      }
     }
   } else {
     for (auto I = RootNode->use_begin(), E = RootNode->use_end();

diff --git a/llvm/test/CodeGen/Hexagon/store-widen-aliased-load.ll b/llvm/test/CodeGen/Hexagon/store-widen-aliased-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon --combiner-store-merging=false < %s | FileCheck %s
 ; CHECK-NOT: memh
 ; Check that store widening does not merge the two stores.
 

diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
@@ -562,69 +562,59 @@ define dso_local void @testUnalignedLdStPair() {
 ; LE-PWR9-LABEL: testUnalignedLdStPair:
 ; LE-PWR9:       # %bb.0: # %entry
 ; LE-PWR9-NEXT:    addis r3, r2, g@toc@ha
-; LE-PWR9-NEXT:    li r6, 19
 ; LE-PWR9-NEXT:    li r4, 11
-; LE-PWR9-NEXT:    li r5, 35
-; LE-PWR9-NEXT:    li r7, 27
 ; LE-PWR9-NEXT:    addi r3, r3, g@toc@l
-; LE-PWR9-NEXT:    lxvx vs0, r3, r6
-; LE-PWR9-NEXT:    ldx r4, r3, r4
-; LE-PWR9-NEXT:    ldx r5, r3, r5
-; LE-PWR9-NEXT:    stdx r4, r3, r6
-; LE-PWR9-NEXT:    stxvx vs0, r3, r7
-; LE-PWR9-NEXT:    li r7, 43
-; LE-PWR9-NEXT:    stdx r5, r3, r7
+; LE-PWR9-NEXT:    lxvx vs0, r3, r4
+; LE-PWR9-NEXT:    li r4, 27
+; LE-PWR9-NEXT:    lxvx vs1, r3, r4
+; LE-PWR9-NEXT:    li r4, 35
+; LE-PWR9-NEXT:    stxvx vs1, r3, r4
+; LE-PWR9-NEXT:    li r4, 19
+; LE-PWR9-NEXT:    stxvx vs0, r3, r4
 ; LE-PWR9-NEXT:    blr
 ;
 ; LE-PWR8-LABEL: testUnalignedLdStPair:
 ; LE-PWR8:       # %bb.0: # %entry
 ; LE-PWR8-NEXT:    addis r3, r2, g@toc@ha
-; LE-PWR8-NEXT:    li r4, 19
+; LE-PWR8-NEXT:    li r4, 27
 ; LE-PWR8-NEXT:    li r5, 11
-; LE-PWR8-NEXT:    li r6, 35
-; LE-PWR8-NEXT:    li r7, 43
-; LE-PWR8-NEXT:    li r8, 27
+; LE-PWR8-NEXT:    li r6, 19
+; LE-PWR8-NEXT:    li r8, 35
 ; LE-PWR8-NEXT:    addi r3, r3, g@toc@l
 ; LE-PWR8-NEXT:    lxvd2x vs0, r3, r4
 ; LE-PWR8-NEXT:    ldx r5, r3, r5
-; LE-PWR8-NEXT:    ldx r6, r3, r6
-; LE-PWR8-NEXT:    stdx r6, r3, r7
-; LE-PWR8-NEXT:    stdx r5, r3, r4
+; LE-PWR8-NEXT:    ldx r7, r3, r6
+; LE-PWR8-NEXT:    stdx r7, r3, r4
+; LE-PWR8-NEXT:    stdx r5, r3, r6
 ; LE-PWR8-NEXT:    stxvd2x vs0, r3, r8
 ; LE-PWR8-NEXT:    blr
 ;
 ; BE-PWR9-LABEL: testUnalignedLdStPair:
 ; BE-PWR9:       # %bb.0: # %entry
 ; BE-PWR9-NEXT:    addis r3, r2, g@toc@ha
-; BE-PWR9-NEXT:    li r6, 19
 ; BE-PWR9-NEXT:    li r4, 11
-; BE-PWR9-NEXT:    li r5, 35
-; BE-PWR9-NEXT:    li r7, 27
 ; BE-PWR9-NEXT:    addi r3, r3, g@toc@l
-; BE-PWR9-NEXT:    lxvx vs0, r3, r6
-; BE-PWR9-NEXT:    ldx r4, r3, r4
-; BE-PWR9-NEXT:    ldx r5, r3, r5
-; BE-PWR9-NEXT:    stdx r4, r3, r6
-; BE-PWR9-NEXT:    stxvx vs0, r3, r7
-; BE-PWR9-NEXT:    li r7, 43
-; BE-PWR9-NEXT:    stdx r5, r3, r7
+; BE-PWR9-NEXT:    lxvx vs0, r3, r4
+; BE-PWR9-NEXT:    li r4, 27
+; BE-PWR9-NEXT:    lxvx vs1, r3, r4
+; BE-PWR9-NEXT:    li r4, 35
+; BE-PWR9-NEXT:    stxvx vs1, r3, r4
+; BE-PWR9-NEXT:    li r4, 19
+; BE-PWR9-NEXT:    stxvx vs0, r3, r4
 ; BE-PWR9-NEXT:    blr
 ;
 ; BE-PWR8-LABEL: testUnalignedLdStPair:
 ; BE-PWR8:       # %bb.0: # %entry
 ; BE-PWR8-NEXT:    addis r3, r2, g@toc@ha
-; BE-PWR8-NEXT:    li r4, 19
-; BE-PWR8-NEXT:    li r5, 11
-; BE-PWR8-NEXT:    li r6, 35
-; BE-PWR8-NEXT:    li r7, 27
+; BE-PWR8-NEXT:    li r4, 11
+; BE-PWR8-NEXT:    li r5, 27
 ; BE-PWR8-NEXT:    addi r3, r3, g@toc@l
 ; BE-PWR8-NEXT:    lxvd2x vs0, r3, r4
-; BE-PWR8-NEXT:    ldx r5, r3, r5
-; BE-PWR8-NEXT:    ldx r6, r3, r6
-; BE-PWR8-NEXT:    stxvd2x vs0, r3, r7
-; BE-PWR8-NEXT:    li r7, 43
-; BE-PWR8-NEXT:    stdx r5, r3, r4
-; BE-PWR8-NEXT:    stdx r6, r3, r7
+; BE-PWR8-NEXT:    lxvd2x vs1, r3, r5
+; BE-PWR8-NEXT:    li r4, 35
+; BE-PWR8-NEXT:    li r5, 19
+; BE-PWR8-NEXT:    stxvd2x vs1, r3, r4
+; BE-PWR8-NEXT:    stxvd2x vs0, r3, r5
 ; BE-PWR8-NEXT:    blr
 entry:
   %0 = bitcast <256 x i1>* @g to i8*

diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -920,3 +920,31 @@ define void @merge_heterogeneous(%struct.C* nocapture %p, %struct.C* nocapture %
   ret void
 }
 
+define i32 @merge_store_load_store_seq(i32* %buff) {
+entry:
+; CHECK-LABEL: merge_store_load_store_seq:
+; CHECK:      movl 4(%rdi), %eax
+; CHECK-NEXT: movq $0, (%rdi)
+; CHECK-NEXT: retq
+
+  store i32 0, i32* %buff, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %buff, i64 1
+  %0 = load i32, i32* %arrayidx1, align 4
+  store i32 0, i32* %arrayidx1, align 4
+  ret i32 %0
+}
+
+define i32 @merge_store_alias(i32* %buff, i32* %other) {
+entry:
+; CHECK-LABEL: merge_store_alias:
+; CHECK:  movl $0, (%rdi)
+; CHECK-NEXT:  movl (%rsi), %eax
+; CHECK-NEXT:  movl $0, 4(%rdi)
+; CHECK-NEXT:  retq
+
+  store i32 0, i32* %buff, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %buff, i64 1
+  %0 = load i32, i32* %other, align 4
+  store i32 0, i32* %arrayidx1, align 4
+  ret i32 %0
+}