[Attributor] Use CFG reasoning to filter potentially interfering writes
Since D104432 we can look through memory by analyzing all writes that
might interfere with a load. This patch adds logic to exclude writes
that, based on CFG reasoning, cannot interfere with a given location:
writes that cannot reach the load, and writes that are guaranteed to be
overwritten by a dominating write before the load is reached. We take
care to handle multi-threaded write-read situations conservatively and
apply the CFG reasoning only when threading effects can be ignored.
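A hypothetical C++ sketch (not part of the patch) of the two situations:

#include <cstdlib>

int illustration(bool C) {
  int X = 0;  // Dominated by "X = 1" below, which also dominates the load
              // and hence overwrites this write: it can be skipped.
  X = 1;      // Dominates the load and is not overwritten: must be kept.
  if (C) {
    X = 2;    // This block never falls through to the load, so this write
    abort();  // cannot reach it and can be skipped as well.
  }
  return X;   // Only "X = 1" interferes, so the load folds to 1.
}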

Differential Revision: https://reviews.llvm.org/D106397
jdoerfert committed Feb 1, 2022
1 parent 191fa41 commit e140d51
Showing 13 changed files with 206 additions and 89 deletions.
9 changes: 9 additions & 0 deletions llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -4745,6 +4745,15 @@ struct AAPointerInfo : public AbstractAttribute {
virtual bool forallInterferingAccesses(
StoreInst &SI, function_ref<bool(const Access &, bool)> CB) const = 0;

/// Call \p CB on all write accesses that might interfere with \p LI and
/// return true if all such accesses were known and the callback returned
/// true for all of them, false otherwise. In contrast to
/// forallInterferingAccesses, this function performs reasoning to exclude
/// write accesses that cannot affect the load even if, on the surface,
/// they look as if they would.
virtual bool forallInterferingWrites(
Attributor &A, const AbstractAttribute &QueryingAA, LoadInst &LI,
function_ref<bool(const Access &, bool)> CB) const = 0;

/// This function should return true if the type of the \p AA is AAPointerInfo
static bool classof(const AbstractAttribute *AA) {
return (AA->getIdAddr() == &ID);
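A minimal caller sketch of the new interface, assuming an Attributor A, a
querying attribute QueryingAA, and a LoadInst LI are in scope (hypothetical
client; the in-tree user is AAValueSimplify further below):

auto CheckWrite = [&](const AAPointerInfo::Access &Acc, bool IsExact) {
  // Inspect one potentially interfering write; return false to abort.
  return true;
};
const auto &PI = A.getAAFor<AAPointerInfo>(
    QueryingAA, IRPosition::value(*LI.getPointerOperand()),
    DepClassTy::REQUIRED);
if (!PI.forallInterferingWrites(A, QueryingAA, LI, CheckWrite)) {
  // Not all interfering writes were known; clients must be conservative.
}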
129 changes: 126 additions & 3 deletions llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -68,6 +69,12 @@ static cl::opt<unsigned, true> MaxPotentialValues(
cl::location(llvm::PotentialConstantIntValuesState::MaxPotentialValues),
cl::init(7));

static cl::opt<unsigned>
MaxInterferingWrites("attributor-max-interfering-writes", cl::Hidden,
cl::desc("Maximum number of interfering writes to "
"check before assuming all might interfere."),
cl::init(6));

STATISTIC(NumAAs, "Number of abstract attributes created");

// Some helper macros to deal with statistics tracking.
@@ -1057,6 +1064,120 @@ struct AAPointerInfoImpl
const override {
return State::forallInterferingAccesses(SI, CB);
}
bool forallInterferingWrites(
Attributor &A, const AbstractAttribute &QueryingAA, LoadInst &LI,
function_ref<bool(const Access &, bool)> UserCB) const override {
SmallPtrSet<const Access *, 8> DominatingWrites;
SmallVector<std::pair<const Access *, bool>, 8> InterferingWrites;

Function &Scope = *LI.getFunction();
const auto &NoSyncAA = A.getAAFor<AANoSync>(
QueryingAA, IRPosition::function(Scope), DepClassTy::OPTIONAL);
const auto *ExecDomainAA = A.lookupAAFor<AAExecutionDomain>(
IRPosition::function(Scope), &QueryingAA, DepClassTy::OPTIONAL);
const bool NoSync = NoSyncAA.isAssumedNoSync();

// Helper to determine if we need to consider threading, which we cannot
// handle right now. However, if the function is (assumed) nosync, or the
// instruction in question is executed only by the initial (main) thread,
// we can ignore threading.
auto CanIgnoreThreading = [&](const Instruction &I) -> bool {
if (NoSync)
return true;
if (ExecDomainAA && ExecDomainAA->isExecutedByInitialThreadOnly(I))
return true;
return false;
};

// Helper to determine if the access is executed by the same thread as the
// load. For now it is sufficient to avoid any potential threading effects
// as we cannot deal with them anyway.
auto IsSameThreadAsLoad = [&](const Access &Acc) -> bool {
return CanIgnoreThreading(*Acc.getLocalInst());
};

// TODO: Use inter-procedural reachability and dominance.
const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(
QueryingAA, IRPosition::function(*LI.getFunction()),
DepClassTy::OPTIONAL);

// Helper to determine if the instruction may reach the load.
auto IsReachableFrom = [&](const Instruction &I) {
const auto &ReachabilityAA = A.getAAFor<AAReachability>(
QueryingAA, IRPosition::function(*I.getFunction()),
DepClassTy::OPTIONAL);
return ReachabilityAA.isAssumedReachable(A, I, LI);
};

const bool CanUseCFGReasoning =
NoRecurseAA.isKnownNoRecurse() && CanIgnoreThreading(LI);
InformationCache &InfoCache = A.getInfoCache();
const DominatorTree *DT =
InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(Scope);

auto AccessCB = [&](const Access &Acc, bool Exact) {
if (!Acc.isWrite())
return true;

// For now we only filter accesses based on CFG reasoning, which does not
// work yet if we have threading effects or the access is complicated.
if (CanUseCFGReasoning) {
if (!IsReachableFrom(*Acc.getLocalInst()))
return true;
if (DT && Exact &&
(Acc.getLocalInst()->getFunction() == LI.getFunction()) &&
IsSameThreadAsLoad(Acc)) {
if (DT->dominates(Acc.getLocalInst(), &LI))
DominatingWrites.insert(&Acc);
}
}

InterferingWrites.push_back({&Acc, Exact});
return true;
};
if (!State::forallInterferingAccesses(LI, AccessCB))
return false;

// If we cannot use CFG reasoning, the non-write accesses are the only
// ones filtered out, so run the user callback on all collected writes.
if (!CanUseCFGReasoning) {
for (auto &It : InterferingWrites)
if (!UserCB(*It.first, It.second))
return false;
return true;
}

// Helper to determine if we can skip a specific write access. This is in
// the worst case quadratic, as we are looking for another write that will
// hide the effect of this one.
auto CanSkipAccess = [&](const Access &Acc, bool Exact) {
if (!IsSameThreadAsLoad(Acc))
return false;
if (!DominatingWrites.count(&Acc))
return false;
for (const Access *DomAcc : DominatingWrites) {
assert(Acc.getLocalInst()->getFunction() ==
DomAcc->getLocalInst()->getFunction() &&
"Expected dominating writes to be in the same function!");

if (DomAcc != &Acc &&
DT->dominates(Acc.getLocalInst(), DomAcc->getLocalInst())) {
return true;
}
}
return false;
};

// Run the user callback on all writes we cannot skip and return whether
// it succeeded for all of them.
unsigned NumInterferingWrites = InterferingWrites.size();
for (auto &It : InterferingWrites)
if (!DT || NumInterferingWrites > MaxInterferingWrites ||
!CanSkipAccess(*It.first, It.second))
if (!UserCB(*It.first, It.second))
return false;
return true;
}

ChangeStatus translateAndAddCalleeState(Attributor &A,
const AAPointerInfo &CalleeAA,
@@ -2878,6 +2999,10 @@ struct AAReachabilityImpl : AAReachability {

/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
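// Reachability reasoning is only sound within a single invocation of the
// function: if the function can recurse, an instruction may "reach"
// another one through a second invocation. Give up unless norecurse holds.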
const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(
*this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
if (!NoRecurseAA.isAssumedNoRecurse())
return indicatePessimisticFixpoint();
return ChangeStatus::UNCHANGED;
}
};
@@ -5238,8 +5363,6 @@ struct AAValueSimplifyImpl : AAValueSimplify {

auto CheckAccess = [&](const AAPointerInfo::Access &Acc, bool IsExact) {
LLVM_DEBUG(dbgs() << " - visit access " << Acc << "\n");
if (!Acc.isWrite())
return true;
if (Acc.isWrittenValueYetUndetermined())
return true;
Value *Content = Acc.getWrittenValue();
@@ -5259,7 +5382,7 @@

auto &PI = A.getAAFor<AAPointerInfo>(AA, IRPosition::value(*Obj),
DepClassTy::REQUIRED);
if (!PI.forallInterferingAccesses(L, CheckAccess))
if (!PI.forallInterferingWrites(A, AA, L, CheckAccess))
return false;
}
return true;
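In short, the skip test above says: a write may be dropped if it runs in
the same thread as the load, dominates the load, and is itself dominated by
another write that also dominates the load and therefore overwrites it. A
standalone model of that check (hypothetical, simplified from the member
function above; dominates and sameThreadAsLoad stand in for
DominatorTree::dominates and the thread check):

#include <set>

struct Write {}; // Stand-in for AAPointerInfo::Access.

// Hypothetical stand-ins, assumed to be provided by the caller.
bool dominates(const Write *A, const Write *B);
bool sameThreadAsLoad(const Write *W);

bool canSkip(const Write *Acc, const std::set<const Write *> &DomWrites) {
  // Only writes in the load's thread that dominate the load are candidates.
  if (!sameThreadAsLoad(Acc) || !DomWrites.count(Acc))
    return false;
  // Acc is dead with respect to the load if some other dominating write
  // executes after it on every path, i.e., if Acc dominates that write.
  for (const Write *Dom : DomWrites)
    if (Dom != Acc && dominates(Acc, Dom))
      return true;
  return false;
}

Note the cap in the caller: if no dominator tree is available, or more than
attributor-max-interfering-writes (default 6) writes were collected, no
write is skipped and the callback runs on all of them.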
@@ -213,5 +213,5 @@ attributes #2 = { argmemonly nounwind }
; IS__CGSCC____: attributes #[[ATTR1:[0-9]+]] = { argmemonly nofree nosync nounwind uwtable willreturn }
; IS__CGSCC____: attributes #[[ATTR2:[0-9]+]] = { argmemonly nofree nounwind willreturn writeonly }
; IS__CGSCC____: attributes #[[ATTR3:[0-9]+]] = { willreturn writeonly }
; IS__CGSCC____: attributes #[[ATTR4:[0-9]+]] = { nounwind willreturn }
; IS__CGSCC____: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind willreturn }
;.
@@ -838,5 +838,5 @@ attributes #5 = { argmemonly nounwind }
; IS__CGSCC____: attributes #[[ATTR4:[0-9]+]] = { argmemonly inlinehint nofree norecurse nosync nounwind uwtable willreturn "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx2" }
; IS__CGSCC____: attributes #[[ATTR5:[0-9]+]] = { argmemonly nofree nounwind willreturn writeonly }
; IS__CGSCC____: attributes #[[ATTR6:[0-9]+]] = { willreturn writeonly }
; IS__CGSCC____: attributes #[[ATTR7:[0-9]+]] = { nounwind willreturn }
; IS__CGSCC____: attributes #[[ATTR7:[0-9]+]] = { nosync nounwind willreturn }
;.
20 changes: 6 additions & 14 deletions llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll
@@ -60,19 +60,15 @@ define internal i32 @f(%struct.ss* byval(%struct.ss) %b, i32* byval(i32) %X, i32
; IS__CGSCC_NPM-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0:[0-9]+]] {
; IS__CGSCC_NPM-NEXT: entry:
; IS__CGSCC_NPM-NEXT: [[X_PRIV:%.*]] = alloca i32, align 4
; IS__CGSCC_NPM-NEXT: store i32 [[TMP2]], i32* [[X_PRIV]], align 4
; IS__CGSCC_NPM-NEXT: [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
; IS__CGSCC_NPM-NEXT: [[B_PRIV_CAST:%.*]] = bitcast %struct.ss* [[B_PRIV]] to i32*
; IS__CGSCC_NPM-NEXT: store i32 1, i32* [[B_PRIV_CAST]], align 8
; IS__CGSCC_NPM-NEXT: [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i64 0, i32 1
; IS__CGSCC_NPM-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 0
; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 8
; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1
; IS__CGSCC_NPM-NEXT: store i32 [[TMP2]], i32* [[TMP]], align 8
; IS__CGSCC_NPM-NEXT: store i32 0, i32* [[X_PRIV]], align 4
; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = add i32 1, 1
; IS__CGSCC_NPM-NEXT: [[L:%.*]] = load i32, i32* [[X_PRIV]], align 4
; IS__CGSCC_NPM-NEXT: [[A:%.*]] = add i32 [[L]], [[TMP2]]
; IS__CGSCC_NPM-NEXT: ret i32 [[A]]
; IS__CGSCC_NPM-NEXT: [[A:%.*]] = add i32 0, 2
; IS__CGSCC_NPM-NEXT: ret i32 undef
;
entry:

@@ -128,16 +124,14 @@ define i32 @test(i32* %X) {
; IS__CGSCC_OPM-NEXT: [[C:%.*]] = call i32 @f(%struct.ss* noalias nocapture nofree noundef nonnull readonly byval([[STRUCT_SS]]) align 8 dereferenceable(12) [[S]], i32* noalias nocapture nofree noundef nonnull readonly byval(i32) align 4 dereferenceable(4) [[X]]) #[[ATTR1:[0-9]+]]
; IS__CGSCC_OPM-NEXT: ret i32 [[C]]
;
; IS__CGSCC_NPM: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn
; IS__CGSCC_NPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@test
; IS__CGSCC_NPM-SAME: (i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; IS__CGSCC_NPM-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] {
; IS__CGSCC_NPM-NEXT: entry:
; IS__CGSCC_NPM-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
; IS__CGSCC_NPM-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4
; IS__CGSCC_NPM-NEXT: [[C:%.*]] = call i32 @f(i32 undef, i64 undef, i32 [[TMP0]]) #[[ATTR2:[0-9]+]]
; IS__CGSCC_NPM-NEXT: ret i32 [[C]]
; IS__CGSCC_NPM-NEXT: ret i32 2
;
entry:
%S = alloca %struct.ss
@@ -158,6 +152,4 @@ entry:
; IS__CGSCC_OPM: attributes #[[ATTR1]] = { nounwind willreturn }
;.
; IS__CGSCC_NPM: attributes #[[ATTR0]] = { nofree norecurse nosync nounwind readnone willreturn }
; IS__CGSCC_NPM: attributes #[[ATTR1]] = { argmemonly nofree norecurse nosync nounwind willreturn }
; IS__CGSCC_NPM: attributes #[[ATTR2]] = { nounwind readnone willreturn }
;.
4 changes: 1 addition & 3 deletions llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll
@@ -54,12 +54,10 @@ define internal void @f(%struct.ss* byval(%struct.ss) %b, i32* byval(i32) %X) n
; IS__CGSCC_NPM-NEXT: [[X_PRIV:%.*]] = alloca i32, align 4
; IS__CGSCC_NPM-NEXT: [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
; IS__CGSCC_NPM-NEXT: [[B_PRIV_CAST:%.*]] = bitcast %struct.ss* [[B_PRIV]] to i32*
; IS__CGSCC_NPM-NEXT: store i32 1, i32* [[B_PRIV_CAST]], align 8
; IS__CGSCC_NPM-NEXT: [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i64 0, i32 1
; IS__CGSCC_NPM-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 0
; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 8
; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1
; IS__CGSCC_NPM-NEXT: store i32 [[TMP2]], i32* [[TMP]], align 8
; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = add i32 1, 1
; IS__CGSCC_NPM-NEXT: ret void
;
entry:
32 changes: 12 additions & 20 deletions llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll
@@ -50,13 +50,11 @@ define internal i32 @f(%struct.ss* byval(%struct.ss) %b) nounwind {
; IS__CGSCC_NPM-NEXT: entry:
; IS__CGSCC_NPM-NEXT: [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
; IS__CGSCC_NPM-NEXT: [[B_PRIV_CAST:%.*]] = bitcast %struct.ss* [[B_PRIV]] to i32*
; IS__CGSCC_NPM-NEXT: store i32 1, i32* [[B_PRIV_CAST]], align 8
; IS__CGSCC_NPM-NEXT: [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i64 0, i32 1
; IS__CGSCC_NPM-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 0
; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 8
; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1
; IS__CGSCC_NPM-NEXT: store i32 [[TMP2]], i32* [[TMP]], align 8
; IS__CGSCC_NPM-NEXT: ret i32 [[TMP1]]
; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = add i32 1, 1
; IS__CGSCC_NPM-NEXT: ret i32 undef
;
entry:
%tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
@@ -109,13 +107,11 @@ define internal i32 @g(%struct.ss* byval(%struct.ss) align 32 %b) nounwind {
; IS__CGSCC_NPM-NEXT: entry:
; IS__CGSCC_NPM-NEXT: [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
; IS__CGSCC_NPM-NEXT: [[B_PRIV_CAST:%.*]] = bitcast %struct.ss* [[B_PRIV]] to i32*
; IS__CGSCC_NPM-NEXT: store i32 1, i32* [[B_PRIV_CAST]], align 32
; IS__CGSCC_NPM-NEXT: [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i64 0, i32 1
; IS__CGSCC_NPM-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 0
; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 32
; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1
; IS__CGSCC_NPM-NEXT: store i32 [[TMP2]], i32* [[TMP]], align 32
; IS__CGSCC_NPM-NEXT: ret i32 [[TMP2]]
; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = add i32 1, 1
; IS__CGSCC_NPM-NEXT: ret i32 undef
;
entry:
%tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
@@ -148,15 +144,15 @@ define i32 @main() nounwind {
; IS__TUNIT_NPM-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
; IS__TUNIT_NPM-NEXT: store i32 1, i32* [[TMP1]], align 8
; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
; IS__TUNIT_NPM-NEXT: [[S_CAST:%.*]] = bitcast %struct.ss* [[S]] to i32*
; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[S_CAST]], align 8
; IS__TUNIT_NPM-NEXT: [[S_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 1
; IS__TUNIT_NPM-NEXT: [[TMP1:%.*]] = load i64, i64* [[S_0_1]], align 8
; IS__TUNIT_NPM-NEXT: [[C0:%.*]] = call i32 @f(i32 [[TMP0]], i64 [[TMP1]]) #[[ATTR2:[0-9]+]]
; IS__TUNIT_NPM-NEXT: [[S_CAST1:%.*]] = bitcast %struct.ss* [[S]] to i32*
; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = load i32, i32* [[S_CAST1]], align 32
; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[S_CAST1]], align 8
; IS__TUNIT_NPM-NEXT: [[S_0_12:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 1
; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = load i64, i64* [[S_0_12]], align 32
; IS__TUNIT_NPM-NEXT: [[TMP1:%.*]] = load i64, i64* [[S_0_12]], align 8
; IS__TUNIT_NPM-NEXT: [[C0:%.*]] = call i32 @f(i32 [[TMP0]], i64 [[TMP1]]) #[[ATTR2:[0-9]+]]
; IS__TUNIT_NPM-NEXT: [[S_CAST:%.*]] = bitcast %struct.ss* [[S]] to i32*
; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = load i32, i32* [[S_CAST]], align 32
; IS__TUNIT_NPM-NEXT: [[S_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 1
; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = load i64, i64* [[S_0_1]], align 32
; IS__TUNIT_NPM-NEXT: [[C1:%.*]] = call i32 @g(i32 [[TMP2]], i64 [[TMP3]]) #[[ATTR2]]
; IS__TUNIT_NPM-NEXT: [[A:%.*]] = add i32 [[C0]], [[C1]]
; IS__TUNIT_NPM-NEXT: ret i32 [[A]]
@@ -181,10 +177,7 @@ define i32 @main() nounwind {
; IS__CGSCC_NPM-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
; IS__CGSCC_NPM-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
; IS__CGSCC_NPM-NEXT: [[C0:%.*]] = call i32 @f(i32 undef, i64 undef) #[[ATTR1:[0-9]+]]
; IS__CGSCC_NPM-NEXT: [[C1:%.*]] = call i32 @g(i32 undef, i64 undef) #[[ATTR1]]
; IS__CGSCC_NPM-NEXT: [[A:%.*]] = add i32 [[C0]], [[C1]]
; IS__CGSCC_NPM-NEXT: ret i32 [[A]]
; IS__CGSCC_NPM-NEXT: ret i32 3
;
entry:
%S = alloca %struct.ss
@@ -209,5 +202,4 @@ entry:
; IS__CGSCC_OPM: attributes #[[ATTR2]] = { nounwind willreturn }
;.
; IS__CGSCC_NPM: attributes #[[ATTR0]] = { nofree norecurse nosync nounwind readnone willreturn }
; IS__CGSCC_NPM: attributes #[[ATTR1]] = { nounwind readnone willreturn }
;.
