Skip to content

Commit

Permalink
[LV] Do not add load to group if it moves across conflicting store.
Browse files Browse the repository at this point in the history
This patch prevents invalid load groups from being formed, where a load
needs to be moved across a conflicting store.

Once we hit a store that conflicts with a load that is already part of an
interleave group, we need to stop adding earlier loads to that group:
adding an earlier load would force the loads already in the group to be
moved across the conflicting store.

To detect such cases, add a new CompletedLoadGroups set, which is used
to keep track of load groups to which no earlier loads can be added.

Fixes #63602

Reviewed By: anna

Differential Revision: https://reviews.llvm.org/D154309
  • Loading branch information
fhahn committed Jul 7, 2023
1 parent fc9821a commit 4d847bf
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 23 deletions.
20 changes: 19 additions & 1 deletion llvm/lib/Analysis/VectorUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1107,6 +1107,8 @@ void InterleavedAccessInfo::analyzeInterleaving(
SmallSetVector<InterleaveGroup<Instruction> *, 4> StoreGroups;
// Holds all interleaved load groups temporarily.
SmallSetVector<InterleaveGroup<Instruction> *, 4> LoadGroups;
// Groups added to this set cannot have new members added.
SmallPtrSet<InterleaveGroup<Instruction> *, 4> CompletedLoadGroups;

// Search in bottom-up program order for pairs of accesses (A and B) that can
// form interleaved load or store groups. In the algorithm below, access A
Expand Down Expand Up @@ -1139,8 +1141,12 @@ void InterleavedAccessInfo::analyzeInterleaving(
}
if (B->mayWriteToMemory())
StoreGroups.insert(Group);
else
else {
// Skip B if no new instructions can be added to its load group.
if (CompletedLoadGroups.contains(Group))
continue;
LoadGroups.insert(Group);
}
}

for (auto AI = std::next(BI); AI != E; ++AI) {
Expand Down Expand Up @@ -1181,6 +1187,18 @@ void InterleavedAccessInfo::analyzeInterleaving(
StoreGroups.remove(StoreGroup);
releaseGroup(StoreGroup);
}
// If B is a load and part of an interleave group, no earlier loads can
// be added to B's interleave group, because this would mean the load B
// would need to be moved across store A. Mark the interleave group as
// complete.
if (isInterleaved(B) && isa<LoadInst>(B)) {
InterleaveGroup<Instruction> *LoadGroup = getInterleaveGroup(B);

LLVM_DEBUG(dbgs() << "LV: Marking interleave group for " << *B
<< " as complete.\n");

CompletedLoadGroups.insert(LoadGroup);
}

// If a dependence exists and A is not already in a group (or it was
// and we just released it), B might be hoisted above A (if B is a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,38 +28,41 @@ define void @pr63602_1(ptr %arr) {
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX2]], 0
; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], 4
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -2
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0
; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 1
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1
; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 2
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2
; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 3
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3
; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP13]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP18]], i32 0
; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP18]], i32 1
; CHECK-NEXT: store i32 [[TMP20]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP18]], i32 2
; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3
; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP13]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = add nuw nsw i64 [[TMP6]], 2
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <12 x i32>, ptr [[TMP20]], align 4
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <12 x i32> [[WIDE_VEC3]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC3]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK-NEXT: [[TMP21:%.*]] = add <4 x i32> [[STRIDED_VEC5]], [[STRIDED_VEC4]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP21]], i32 0
; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP21]], i32 1
; CHECK-NEXT: store i32 [[TMP23]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP21]], i32 2
; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP21]], i32 3
; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP13]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 17, 16
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 49, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 52, [[MIDDLE_BLOCK]] ], [ 4, [[ENTRY]] ]
Expand All @@ -81,7 +84,7 @@ define void @pr63602_1(ptr %arr) {
; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_IV_2]], align 4
; CHECK-NEXT: [[IV_2_NEXT]] = add nuw nsw i64 [[IV_2]], 3
; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[IV_2]], 50
; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
Expand Down Expand Up @@ -110,4 +113,3 @@ loop:
exit:
ret void
}

0 comments on commit 4d847bf

Please sign in to comment.