diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index 450595cac57b45..ce9944a5ce4bee 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -902,6 +902,11 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock(
     Instruction *QueryInst, const MemoryLocation &Loc, bool isLoad,
     BasicBlock *BB, NonLocalDepInfo *Cache, unsigned NumSortedEntries) {
 
+  bool isInvariantLoad = false;
+
+  if (LoadInst *LI = dyn_cast_or_null<LoadInst>(QueryInst))
+    isInvariantLoad = LI->getMetadata(LLVMContext::MD_invariant_load);
+
   // Do a binary search to see if we already have an entry for this block in
   // the cache set.  If so, find it.
   NonLocalDepInfo::iterator Entry = std::upper_bound(
@@ -913,6 +918,13 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock(
   if (Entry != Cache->begin() + NumSortedEntries && Entry->getBB() == BB)
     ExistingResult = &*Entry;
 
+  // Use a cached result for an invariant load only if the cached entry shows
+  // no dependency for the corresponding non-invariant load; in that case the
+  // invariant load cannot have a dependency either.
+  if (ExistingResult && isInvariantLoad &&
+      !ExistingResult->getResult().isNonFuncLocal())
+    ExistingResult = nullptr;
+
   // If we have a cached entry, and it is non-dirty, use it as the value for
   // this dependency.
   if (ExistingResult && !ExistingResult->getResult().isDirty()) {
@@ -941,6 +953,10 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock(
   MemDepResult Dep =
       getPointerDependencyFrom(Loc, isLoad, ScanPos, BB, QueryInst);
 
+  // Don't cache results for invariant loads.
+  if (isInvariantLoad)
+    return Dep;
+
   // If we had a dirty entry for the block, update it.  Otherwise, just add
   // a new entry.
   if (ExistingResult)
@@ -1029,6 +1045,10 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
   InitialNLPI.Size = Loc.Size;
   InitialNLPI.AATags = Loc.AATags;
 
+  bool isInvariantLoad = false;
+  if (LoadInst *LI = dyn_cast_or_null<LoadInst>(QueryInst))
+    isInvariantLoad = LI->getMetadata(LLVMContext::MD_invariant_load);
+
   // Get the NLPI for CacheKey, inserting one into the map if it doesn't
   // already have one.
   std::pair<CachedNonLocalPointerInfo::iterator, bool> Pair =
@@ -1037,7 +1057,8 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
 
   // If we already have a cache entry for this CacheKey, we may need to do some
   // work to reconcile the cache entry and the current query.
-  if (!Pair.second) {
+  // Invariant loads don't participate in caching, so there is nothing to reconcile.
+  if (!isInvariantLoad && !Pair.second) {
     if (CacheInfo->Size != Loc.Size) {
       bool ThrowOutEverything;
       if (CacheInfo->Size.hasValue() && Loc.Size.hasValue()) {
@@ -1093,7 +1114,10 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
 
   // If we have valid cached information for exactly the block we are
   // investigating, just return it with no recomputation.
-  if (CacheInfo->Pair == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) {
+  // Don't use cached information for invariant loads, since it is valid only
+  // for non-invariant loads.
+  if (!isInvariantLoad &&
+      CacheInfo->Pair == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) {
     // We have a fully cached result for this query then we can just return the
     // cached results and populate the visited set.  However, we have to verify
     // that we don't already have conflicting results for these blocks.  Check
@@ -1129,14 +1153,18 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
     return true;
   }
 
-  // Otherwise, either this is a new block, a block with an invalid cache
-  // pointer or one that we're about to invalidate by putting more info into it
-  // than its valid cache info.  If empty, the result will be valid cache info,
-  // otherwise it isn't.
-  if (Cache->empty())
-    CacheInfo->Pair = BBSkipFirstBlockPair(StartBB, SkipFirstBlock);
-  else
-    CacheInfo->Pair = BBSkipFirstBlockPair();
+  // Invariant loads don't affect the cache in any way, so there is no need to
+  // update CacheInfo either.
+  if (!isInvariantLoad) {
+    // Otherwise, either this is a new block, a block with an invalid cache
+    // pointer or one that we're about to invalidate by putting more info into
+    // it than its valid cache info.  If empty, the result will be valid cache
+    // info, otherwise it isn't.
+    if (Cache->empty())
+      CacheInfo->Pair = BBSkipFirstBlockPair(StartBB, SkipFirstBlock);
+    else
+      CacheInfo->Pair = BBSkipFirstBlockPair();
+  }
 
   SmallVector<BasicBlock *, 32> Worklist;
   Worklist.push_back(StartBB);
@@ -1377,22 +1405,26 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
     if (SkipFirstBlock)
       return false;
 
-    bool foundBlock = false;
-    for (NonLocalDepEntry &I : llvm::reverse(*Cache)) {
-      if (I.getBB() != BB)
-        continue;
+    // Results of invariant loads are not cached, so there is no cached
+    // information to update.
+    if (!isInvariantLoad) {
+      bool foundBlock = false;
+      for (NonLocalDepEntry &I : llvm::reverse(*Cache)) {
+        if (I.getBB() != BB)
+          continue;
 
-      assert((GotWorklistLimit || I.getResult().isNonLocal() ||
-              !DT.isReachableFromEntry(BB)) &&
-             "Should only be here with transparent block");
-      foundBlock = true;
-      I.setResult(MemDepResult::getUnknown());
-      Result.push_back(
-          NonLocalDepResult(I.getBB(), I.getResult(), Pointer.getAddr()));
-      break;
+        assert((GotWorklistLimit || I.getResult().isNonLocal() ||
+                !DT.isReachableFromEntry(BB)) &&
+               "Should only be here with transparent block");
+        foundBlock = true;
+        I.setResult(MemDepResult::getUnknown());
+        Result.push_back(
+            NonLocalDepResult(I.getBB(), I.getResult(), Pointer.getAddr()));
+        break;
+      }
+      (void)foundBlock; (void)GotWorklistLimit;
+      assert((foundBlock || GotWorklistLimit) && "Current block not in cache?");
     }
-    (void)foundBlock; (void)GotWorklistLimit;
-    assert((foundBlock || GotWorklistLimit) && "Current block not in cache?");
   }
 
   // Okay, we're done now.  If we added new values to the cache, re-sort it.
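The check added at the top of both GetNonLocalInfoForBlock and getNonLocalPointerDepFromBB is the same three-line pattern. Factored out as a standalone illustration (a minimal sketch assuming the usual LLVM headers; the helper name is hypothetical, the patch itself inlines the check in both places):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

// True when the query instruction is a load tagged with !invariant.load.
// dyn_cast_or_null tolerates a null QueryInst, matching how non-local
// queries can be issued without a concrete query instruction.
static bool isInvariantLoadQuery(const Instruction *QueryInst) {
  if (const auto *LI = dyn_cast_or_null<LoadInst>(QueryInst))
    return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
  return false;
}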
diff --git a/llvm/test/Analysis/MemoryDependenceAnalysis/InvariantLoad.ll b/llvm/test/Analysis/MemoryDependenceAnalysis/InvariantLoad.ll
new file mode 100644
index 00000000000000..152cb175ef6082
--- /dev/null
+++ b/llvm/test/Analysis/MemoryDependenceAnalysis/InvariantLoad.ll
@@ -0,0 +1,173 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -gvn -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @llvm.memset.p0i8.i8(i8*, i8, i32, i1)
+declare void @foo(i8*)
+
+define i8 @test(i1 %cmp) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P:%.*]] = alloca i8
+; CHECK-NEXT:    store i8 5, i8* [[P]]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[V:%.*]] = phi i8 [ 5, [[ENTRY:%.*]] ], [ -5, [[ALIVE:%.*]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[I_INC:%.*]], [[ALIVE]] ]
+; CHECK-NEXT:    br i1 [[CMP:%.*]], label [[ALIVE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    call void @foo(i8* [[P]])
+; CHECK-NEXT:    [[I_1:%.*]] = add i8 [[I]], [[V]]
+; CHECK-NEXT:    br label [[ALIVE]]
+; CHECK:       alive:
+; CHECK-NEXT:    [[I_2:%.*]] = phi i8 [ [[I]], [[HEADER]] ], [ [[I_1]], [[DEAD]] ]
+; CHECK-NEXT:    store i8 -5, i8* [[P]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* align 1 [[P]], i8 0, i32 1, i1 false)
+; CHECK-NEXT:    [[I_INC]] = add i8 [[I_2]], 1
+; CHECK-NEXT:    [[CMP_LOOP:%.*]] = icmp ugt i8 [[I_INC]], 100
+; CHECK-NEXT:    br i1 [[CMP_LOOP]], label [[EXIT:%.*]], label [[HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i8 0
+;
+
+entry:
+  %p = alloca i8
+  %addr = getelementptr inbounds i8, i8* %p, i64 0
+  store i8 5, i8* %addr
+  br label %header
+header:
+  %i = phi i8 [0, %entry], [%i.inc, %backedge]
+  br i1 %cmp, label %alive, label %dead
+dead:
+  call void @foo(i8* %p)
+  %v = load i8, i8* %addr, !invariant.load !1
+  %i.1 = add i8 %i, %v
+  br label %alive
+alive:
+  %i.2 = phi i8 [%i, %header], [%i.1, %dead]
+  store i8 -5, i8* %addr
+  br label %backedge
+backedge:
+  call void @llvm.memset.p0i8.i8(i8 * align 1 %p, i8 0, i32 1, i1 false)
+  %i.inc = add i8 %i.2, 1
+  %cmp.loop = icmp ugt i8 %i.inc, 100
+  br i1 %cmp.loop, label %exit, label %header
+exit:
+  %res = load i8, i8* %addr
+  ret i8 %res
+}
+
+; Check that the first two loads are not optimized out, while the one marked
+; with !invariant.load reuses %res1.
+define i8 @test2(i1 %cmp, i8 *%p) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES1:%.*]] = load i8, i8* [[P:%.*]]
+; CHECK-NEXT:    call void @foo(i8* [[P]])
+; CHECK-NEXT:    br i1 [[CMP:%.*]], label [[B2:%.*]], label [[B1:%.*]]
+; CHECK:       b1:
+; CHECK-NEXT:    [[RES2:%.*]] = load i8, i8* [[P]]
+; CHECK-NEXT:    [[RES3:%.*]] = add i8 [[RES1]], [[RES2]]
+; CHECK-NEXT:    br label [[ALIVE:%.*]]
+; CHECK:       b2:
+; CHECK-NEXT:    [[RES_DEAD:%.*]] = add i8 [[RES1]], [[RES1]]
+; CHECK-NEXT:    br label [[ALIVE]]
+; CHECK:       alive:
+; CHECK-NEXT:    [[RES_PHI:%.*]] = phi i8 [ [[RES3]], [[B1]] ], [ [[RES_DEAD]], [[B2]] ]
+; CHECK-NEXT:    ret i8 [[RES_PHI]]
+;
+
+entry:
+  %res1 = load i8, i8* %p
+  call void @foo(i8 *%p)
+  br i1 %cmp, label %b2, label %b1
+b1:
+  %res2 = load i8, i8* %p
+  %res3 = add i8 %res1, %res2
+  br label %alive
+b2:
+  %v = load i8, i8* %p, !invariant.load !1
+  %res.dead = add i8 %v, %res1
+  br label %alive
+alive:
+  %res.phi = phi i8 [%res3, %b1], [%res.dead, %b2]
+  ret i8 %res.phi
+}
+
+; This is essentially the same test case as the one above, but with %b1 and
+; %b2 swapped in the "br i1 %cmp, label %b1, label %b2"
+; instruction. This ensures that the result does not depend on visiting order.
+define i8 @test3(i1 %cmp, i8 *%p) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES1:%.*]] = load i8, i8* [[P:%.*]]
+; CHECK-NEXT:    call void @foo(i8* [[P]])
+; CHECK-NEXT:    br i1 [[CMP:%.*]], label [[B1:%.*]], label [[B2:%.*]]
+; CHECK:       b1:
+; CHECK-NEXT:    [[RES2:%.*]] = load i8, i8* [[P]]
+; CHECK-NEXT:    [[RES3:%.*]] = add i8 [[RES1]], [[RES2]]
+; CHECK-NEXT:    br label [[ALIVE:%.*]]
+; CHECK:       b2:
+; CHECK-NEXT:    [[RES_DEAD:%.*]] = add i8 [[RES1]], [[RES1]]
+; CHECK-NEXT:    br label [[ALIVE]]
+; CHECK:       alive:
+; CHECK-NEXT:    [[RES_PHI:%.*]] = phi i8 [ [[RES3]], [[B1]] ], [ [[RES_DEAD]], [[B2]] ]
+; CHECK-NEXT:    ret i8 [[RES_PHI]]
+;
+entry:
+  %res1 = load i8, i8* %p
+  call void @foo(i8 *%p)
+  br i1 %cmp, label %b1, label %b2
+b1:
+  %res2 = load i8, i8* %p
+  %res3 = add i8 %res1, %res2
+  br label %alive
+b2:
+  %v = load i8, i8* %p, !invariant.load !1
+  %res.dead = add i8 %v, %res1
+  br label %alive
+alive:
+  %res.phi = phi i8 [%res3, %b1], [%res.dead, %b2]
+  ret i8 %res.phi
+}
+
+
+; This is a reduced test case catching a regression in the first version of
+; the fix for invariant loads (https://reviews.llvm.org/D64405).
+define void @test4() {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* inttoptr (i64 8 to float*), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul float [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    br label [[FUSION_LOOP_HEADER_DIM_1_PREHEADER:%.*]]
+; CHECK:       fusion.loop_header.dim.1.preheader:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FUSION_LOOP_HEADER_DIM_1_PREHEADER]] ]
+; CHECK-NEXT:    [[FUSION_INVAR_ADDRESS_DIM_0_03:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INVAR_INC3:%.*]], [[FUSION_LOOP_HEADER_DIM_1_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 [[FUSION_INVAR_ADDRESS_DIM_0_03]], i64 0, i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul float [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[INVAR_INC3]] = add nuw nsw i64 [[FUSION_INVAR_ADDRESS_DIM_0_03]], 1
+; CHECK-NEXT:    [[DOTPHI_TRANS_INSERT:%.*]] = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 [[INVAR_INC3]], i64 0, i64 2
+; CHECK-NEXT:    [[DOTPRE]] = load float, float* [[DOTPHI_TRANS_INSERT]], align 4, !invariant.load !0
+; CHECK-NEXT:    br label [[FUSION_LOOP_HEADER_DIM_1_PREHEADER]]
+;
+entry:
+  %0 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 0, i64 0, i64 2
+  %1 = load float, float* %0, align 4
+  %2 = fmul float %1, %1
+  br label %fusion.loop_header.dim.1.preheader
+
+fusion.loop_header.dim.1.preheader:               ; preds = %fusion.loop_header.dim.1.preheader, %entry
+  %fusion.invar_address.dim.0.03 = phi i64 [ 0, %entry ], [ %invar.inc3, %fusion.loop_header.dim.1.preheader ]
+  %3 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 %fusion.invar_address.dim.0.03, i64 0, i64 2
+  %4 = load float, float* %3, align 4, !invariant.load !1
+  %5 = fmul float %4, %4
+  %6 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 %fusion.invar_address.dim.0.03, i64 0, i64 2
+  %7 = load float, float* %6, align 4, !invariant.load !1
+  %8 = fmul float %7, %7
+  %invar.inc3 = add nuw nsw i64 %fusion.invar_address.dim.0.03, 1
+  br label %fusion.loop_header.dim.1.preheader
+}
+
+!1 = !{}
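These tests rely on the contract of !invariant.load: the loaded location holds the same value at every point of the program where it is dereferenceable, so GVN may forward a value across calls and stores that would clobber an ordinary load. A minimal sketch of how a frontend or pass attaches the tag (hypothetical helper, not part of this patch):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// Mark an existing load as !invariant.load. The metadata node carries no
// operands; its presence alone is the promise that the location does not
// change while it is dereferenceable.
static void markInvariantLoad(LoadInst *LI) {
  LI->setMetadata(LLVMContext::MD_invariant_load,
                  MDNode::get(LI->getContext(), {}));
}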