Skip to content

Commit

Permalink
[LICM] Infer proper alignment from loads during scalar promotion
Browse files Browse the repository at this point in the history
This patch fixes an issue where we would compute an unnecessarily small alignment during scalar promotion when no store is guaranteed to execute, but we've proven load speculation safety. Since speculating a load requires proving the existing alignment is valid at the new location (see Loads.cpp), we can use the alignment fact from the load.

For non-atomics, this is a performance problem. For atomics, this is a correctness issue, though an *incredibly* rare one to see in practice. For atomics, we might not be able to lower an improperly aligned load or store (i.e. i32 align 1). If such an instruction makes it all the way to codegen, we *may* fail to codegen the operation, or we may simply generate a slow call to a library function. The part that makes this super hard to see in practice is that the memory location actually *is* well aligned, and instcombine knows that. So, to see a failure, you have to have a) hit the bug in LICM, b) somehow hit a depth limit in InstCombine/ValueTracking to avoid fixing the alignment, and c) then have generated an instruction which fails codegen rather than simply emitting a slow libcall. All around, pretty hard to hit.

Differential Revision: https://reviews.llvm.org/D58809

llvm-svn: 355217
  • Loading branch information
preames committed Mar 1, 2019
1 parent 06ed385 commit 2226e9a
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 15 deletions.
26 changes: 23 additions & 3 deletions llvm/lib/Transforms/Scalar/LICM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1924,9 +1924,21 @@ bool llvm::promoteLoopAccessesToScalars(
SawUnorderedAtomic |= Load->isAtomic();
SawNotAtomic |= !Load->isAtomic();

if (!DereferenceableInPH)
DereferenceableInPH = isSafeToExecuteUnconditionally(
*Load, DT, CurLoop, SafetyInfo, ORE, Preheader->getTerminator());
unsigned InstAlignment = Load->getAlignment();
if (!InstAlignment)
InstAlignment =
MDL.getABITypeAlignment(Load->getType());

// Note that proving a load safe to speculate requires proving
// sufficient alignment at the target location. Proving it guaranteed
// to execute does as well. Thus we can increase our guaranteed
// alignment as well.
if (!DereferenceableInPH || (InstAlignment > Alignment))
if (isSafeToExecuteUnconditionally(*Load, DT, CurLoop, SafetyInfo,
ORE, Preheader->getTerminator())) {
DereferenceableInPH = true;
Alignment = std::max(Alignment, InstAlignment);
}
} else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
// Stores *of* the pointer are not interesting, only stores *to* the
// pointer.
Expand Down Expand Up @@ -1997,6 +2009,14 @@ bool llvm::promoteLoopAccessesToScalars(
if (SawUnorderedAtomic && SawNotAtomic)
return false;

// If we're inserting an atomic load in the preheader, we must be able to
// lower it. We're only guaranteed to be able to lower naturally aligned
// atomics.
auto *SomePtrElemType = SomePtr->getType()->getPointerElementType();
if (SawUnorderedAtomic &&
Alignment < MDL.getTypeStoreSize(SomePtrElemType))
return false;

// If we couldn't prove we can hoist the load, bail.
if (!DereferenceableInPH)
return false;
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/Transforms/LICM/promote-tls.ll
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ entry:

for.body.lr.ph: ; preds = %entry
; CHECK-LABEL: for.body.lr.ph:
; CHECK-NEXT: %addr.promoted = load i32, i32* %addr, align 1
; CHECK-NEXT: %addr.promoted = load i32, i32* %addr, align 4
br label %for.header

for.header:
Expand All @@ -35,7 +35,7 @@ for.header:

early-exit:
; CHECK-LABEL: early-exit:
; CHECK: store i32 %new1.lcssa, i32* %addr, align 1
; CHECK: store i32 %new1.lcssa, i32* %addr, align 4
ret i32* null

for.body:
Expand All @@ -47,7 +47,7 @@ for.body:

for.cond.for.end_crit_edge: ; preds = %for.body
; CHECK-LABEL: for.cond.for.end_crit_edge:
; CHECK: store i32 %new.lcssa, i32* %addr, align 1
; CHECK: store i32 %new.lcssa, i32* %addr, align 4
%split = phi i32* [ %addr, %for.body ]
ret i32* null
}
Expand All @@ -62,7 +62,7 @@ entry:

for.body.lr.ph: ; preds = %entry
; CHECK-LABEL: for.body.lr.ph:
; CHECK-NEXT: %addr.promoted = load i32, i32* %addr, align 1
; CHECK-NEXT: %addr.promoted = load i32, i32* %addr, align 4
br label %for.header

for.header:
Expand All @@ -75,7 +75,7 @@ for.header:

early-exit:
; CHECK-LABEL: early-exit:
; CHECK: store i32 %new1.lcssa, i32* %addr, align 1
; CHECK: store i32 %new1.lcssa, i32* %addr, align 4
ret i32* null

for.body:
Expand All @@ -87,7 +87,7 @@ for.body:

for.cond.for.end_crit_edge: ; preds = %for.body
; CHECK-LABEL: for.cond.for.end_crit_edge:
; CHECK: store i32 %new.lcssa, i32* %addr, align 1
; CHECK: store i32 %new.lcssa, i32* %addr, align 4
%split = phi i32* [ %addr, %for.body ]
ret i32* null
}
Expand Down
10 changes: 4 additions & 6 deletions llvm/test/Transforms/LICM/scalar-promote-unwind.ll
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ define void @test3(i1 zeroext %y) uwtable {
entry:
; CHECK-LABEL: entry:
; CHECK-NEXT: %a = alloca i32
; CHECK-NEXT: %a.promoted = load i32, i32* %a, align 1
; CHECK-NEXT: %a.promoted = load i32, i32* %a, align 4
%a = alloca i32
br label %for.body

Expand All @@ -90,20 +90,18 @@ for.body:

for.cond.cleanup:
; CHECK-LABEL: for.cond.cleanup:
; CHECK: store i32 %add.lcssa, i32* %a, align 1
; CHECK: store i32 %add.lcssa, i32* %a, align 4
; CHECK-NEXT: ret void
ret void
}

;; Same as test3, but with unordered atomics
;; FIXME: doing the transform w/o alignment here is wrong since we're
;; creating an unaligned atomic which we may not be able to lower.
define void @test3b(i1 zeroext %y) uwtable {
; CHECK-LABEL: @test3
entry:
; CHECK-LABEL: entry:
; CHECK-NEXT: %a = alloca i32
; CHECK-NEXT: %a.promoted = load atomic i32, i32* %a unordered, align 1
; CHECK-NEXT: %a.promoted = load atomic i32, i32* %a unordered, align 4
%a = alloca i32
br label %for.body

Expand All @@ -119,7 +117,7 @@ for.body:

for.cond.cleanup:
; CHECK-LABEL: for.cond.cleanup:
; CHECK: store atomic i32 %add.lcssa, i32* %a unordered, align 1
; CHECK: store atomic i32 %add.lcssa, i32* %a unordered, align 4
; CHECK-NEXT: ret void
ret void
}
Expand Down

0 comments on commit 2226e9a

Please sign in to comment.