diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index bb95265a794a0..73ec607014d31 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -122,6 +122,7 @@ class AMDGPUPromoteAllocaImpl {
   /// Check whether we have enough local memory for promotion.
   bool hasSufficientLocalMem(const Function &F);
 
+  FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
   bool tryPromoteAllocaToVector(AllocaInst &I);
   bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
 
@@ -501,27 +502,14 @@ static Value *promoteAllocaUserToVector(
     Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
     unsigned VecStoreSize, unsigned ElementSize,
     DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
-    std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal,
-    SmallVectorImpl<LoadInst *> &DeferredLoads) {
+    std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx,
+    function_ref<Value *()> GetCurVal) {
   // Note: we use InstSimplifyFolder because it can leverage the DataLayout
   // to do more folding, especially in the case of vector splats.
   IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
                                         InstSimplifyFolder(DL));
   Builder.SetInsertPoint(Inst);
 
-  const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
-    if (CurVal)
-      return CurVal;
-
-    // If the current value is not known, insert a dummy load and lower it on
-    // the second pass.
-    LoadInst *Dummy =
-        Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
-                           "promotealloca.dummyload");
-    DeferredLoads.push_back(Dummy);
-    return Dummy;
-  };
-
   const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
                                                    Type *PtrTy) -> Value * {
     assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
@@ -541,12 +529,7 @@ static Value *promoteAllocaUserToVector(
 
   switch (Inst->getOpcode()) {
   case Instruction::Load: {
-    // Loads can only be lowered if the value is known.
-    if (!CurVal) {
-      DeferredLoads.push_back(cast<LoadInst>(Inst));
-      return nullptr;
-    }
-
+    Value *CurVal = GetCurVal();
     Value *Index = calculateVectorIndex(
         cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
 
@@ -636,7 +619,7 @@ static Value *promoteAllocaUserToVector(
 
       Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
 
-      Value *CurVec = GetOrLoadCurrentVectorValue();
+      Value *CurVec = GetCurVal();
       for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
            K < NumElts; ++K) {
         Value *CurIdx =
@@ -649,8 +632,7 @@ static Value *promoteAllocaUserToVector(
     if (Val->getType() != VecEltTy)
       Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
-    return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
-                                       Index);
+    return Builder.CreateInsertElement(GetCurVal(), Val, Index);
   }
   case Instruction::Call: {
     if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
@@ -672,7 +654,7 @@ static Value *promoteAllocaUserToVector(
         }
       }
 
-      return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
+      return Builder.CreateShuffleVector(GetCurVal(), Mask);
     }
 
     if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -791,16 +773,13 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
   return I;
 }
 
-// FIXME: Should try to pick the most likely to be profitable allocas first.
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
-  LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
-
+FixedVectorType *
+AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
   if (DisablePromoteAllocaToVector) {
-    LLVM_DEBUG(dbgs() << "  Promote alloca to vector is disabled\n");
-    return false;
+    LLVM_DEBUG(dbgs() << "  Promote alloca to vectors is disabled\n");
+    return nullptr;
   }
 
-  Type *AllocaTy = Alloca.getAllocatedType();
   auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
   if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
     uint64_t NumElems = 1;
@@ -832,10 +811,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       }
     }
   }
-
   if (!VectorTy) {
     LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
-    return false;
+    return nullptr;
   }
 
   const unsigned MaxElements =
@@ -845,9 +823,29 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       VectorTy->getNumElements() < 2) {
     LLVM_DEBUG(dbgs() << "  " << *VectorTy
                       << " has an unsupported number of elements\n");
-    return false;
+    return nullptr;
+  }
+
+  Type *VecEltTy = VectorTy->getElementType();
+  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
+  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
+                         "does not match the type's size\n");
+    return nullptr;
   }
+
+  return VectorTy;
+}
+
+// FIXME: Should try to pick the most likely to be profitable allocas first.
+bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
+  LLVM_DEBUG(dbgs() << "Trying to promote to vectors: " << Alloca << '\n');
+
+  Type *AllocaTy = Alloca.getAllocatedType();
+  FixedVectorType *VectorTy = getVectorTypeForAlloca(AllocaTy);
+  if (!VectorTy)
+    return false;
+
   std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx;
   SmallVector<Instruction *> WorkList;
   SmallVector<Instruction *> UsersToRemove;
@@ -869,13 +867,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
 
   Type *VecEltTy = VectorTy->getElementType();
-  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
-  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
-    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
-                         "does not match the type's size\n");
-    return false;
-  }
-  unsigned ElementSize = ElementSizeInBits / 8;
+  unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
   assert(ElementSize > 0);
   for (auto *U : Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());
@@ -1027,37 +1019,44 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
   Updater.AddAvailableValue(EntryBB, AllocaInitValue);
 
-  // First handle the initial worklist.
-  SmallVector<LoadInst *> DeferredLoads;
+  // First handle the initial worklist, in basic block order.
+  //
+  // Insert a placeholder whenever we need the vector value at the top of a
+  // basic block.
+  SmallVector<Instruction *> Placeholders;
   forEachWorkListItem(WorkList, [&](Instruction *I) {
     BasicBlock *BB = I->getParent();
-    // On the first pass, we only take values that are trivially known, i.e.
-    // where AddAvailableValue was already called in this block.
-    Value *Result = promoteAllocaUserToVector(
-        I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
-        Updater.FindValueForBlock(BB), DeferredLoads);
+    auto GetCurVal = [&]() -> Value * {
+      if (Value *CurVal = Updater.FindValueForBlock(BB))
+        return CurVal;
+
+      // If the current value in the basic block is not yet known, insert a
+      // placeholder that we will replace later.
+      IRBuilder<> Builder(I);
+      auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
+          PoisonValue::get(VectorTy), "promotealloca.placeholder"));
+      Placeholders.push_back(Placeholder);
+      Updater.AddAvailableValue(BB, Placeholder);
+      return Placeholder;
+    };
+
+    Value *Result =
+        promoteAllocaUserToVector(I, *DL, VectorTy, VecStoreSize, ElementSize,
+                                  TransferInfo, GEPVectorIdx, GetCurVal);
     if (Result)
       Updater.AddAvailableValue(BB, Result);
   });
 
-  // Then handle deferred loads.
-  forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
-    SmallVector<LoadInst *> NewDLs;
-    BasicBlock *BB = I->getParent();
-    // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
-    // get a value, inserting PHIs as needed.
-    Value *Result = promoteAllocaUserToVector(
-        I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
-        Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
-    if (Result)
-      Updater.AddAvailableValue(BB, Result);
-    assert(NewDLs.empty() && "No more deferred loads should be queued!");
-  });
+  // Now fix up the placeholders.
+  for (Instruction *Placeholder : Placeholders) {
+    Placeholder->replaceAllUsesWith(
+        Updater.GetValueInMiddleOfBlock(Placeholder->getParent()));
+    Placeholder->eraseFromParent();
+  }
 
-  // Delete all instructions. On the first pass, new dummy loads may have been
-  // added so we need to collect them too.
+  // Delete all instructions.
   DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
-  InstsToDelete.insert_range(DeferredLoads);
   for (Instruction *I : InstsToDelete) {
     assert(I->use_empty());
     I->eraseFromParent();
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll
index 8e4cc2b0236c0..a7090960518af 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll
@@ -1,11 +1,11 @@
-; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
 ; Show what the alloca promotion pass will do for non-atomic load/store.
 ; OPT-LABEL: @vector_alloca_not_atomic(
 ;
-; OPT: extractelement <3 x i32> <i32 0, i32 1, i32 2>, i64 %index
-define amdgpu_kernel void @vector_alloca_not_atomic(ptr addrspace(1) %out, i64 %index) {
+; OPT: extractelement <3 x i32> <i32 0, i32 1, i32 2>, i32 %index
+define amdgpu_kernel void @vector_alloca_not_atomic(ptr addrspace(1) %out, i32 %index) {
 entry:
   %alloca = alloca [3 x i32], addrspace(5)
   %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
@@ -13,7 +13,7 @@ entry:
   store i32 0, ptr addrspace(5) %alloca
   store i32 1, ptr addrspace(5) %a1
   store i32 2, ptr addrspace(5) %a2
-  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index
   %data = load i32, ptr addrspace(5) %tmp
   store i32 %data, ptr addrspace(1) %out
   ret void
@@ -26,7 +26,7 @@ entry:
 ; OPT: store i32 1, ptr addrspace(5)
 ; OPT: store i32 2, ptr addrspace(5)
 ; OPT: load atomic i32, ptr addrspace(5)
-define amdgpu_kernel void @vector_alloca_atomic_read(ptr addrspace(1) %out, i64 %index) {
+define amdgpu_kernel void @vector_alloca_atomic_read(ptr addrspace(1) %out, i32 %index) {
 entry:
   %alloca = alloca [3 x i32], addrspace(5)
   %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
@@ -34,7 +34,7 @@ entry:
   store i32 0, ptr addrspace(5) %alloca
   store i32 1, ptr addrspace(5) %a1
   store i32 2, ptr addrspace(5) %a2
-  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index
   %data = load atomic i32, ptr addrspace(5) %tmp acquire, align 4
   store i32 %data, ptr addrspace(1) %out
   ret void
@@ -47,7 +47,7 @@ entry:
 ; OPT: store atomic i32 1, ptr addrspace(5)
 ; OPT: store atomic i32 2, ptr addrspace(5)
 ; OPT: load i32, ptr addrspace(5)
-define amdgpu_kernel void @vector_alloca_atomic_write(ptr addrspace(1) %out, i64 %index) {
+define amdgpu_kernel void @vector_alloca_atomic_write(ptr addrspace(1) %out, i32 %index) {
 entry:
   %alloca = alloca [3 x i32], addrspace(5)
   %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
@@ -55,7 +55,7 @@ entry:
   store atomic i32 0, ptr addrspace(5) %alloca release, align 4
   store atomic i32 1, ptr addrspace(5) %a1 release, align 4
   store atomic i32 2, ptr addrspace(5) %a2 release, align 4
-  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index
   %data = load i32, ptr addrspace(5) %tmp
   store i32 %data, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index 9c05f4d16cb4e..4a29f7e53e93a 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -72,7 +72,8 @@ entry:
 ; OPT-NOT: alloca
 ; OPT: bb2:
 ; OPT: %promotealloca = phi <6 x float> [ zeroinitializer, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x float> %promotealloca, float %tmp71, i32 %tmp10
+; OPT: [[TMP:%tmp7.*]] = load float, ptr addrspace(1) %tmp5, align 4
+; OPT: %0 = insertelement <6 x float> %promotealloca, float [[TMP]], i32 %tmp10
 ; OPT: .preheader:
 ; OPT: %bc = bitcast <6 x float> %0 to <6 x i32>
 ; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20
@@ -132,7 +133,8 @@ bb15:                                             ; preds = %.preheader
 ; OPT-NOT: alloca
 ; OPT: bb2:
 ; OPT: %promotealloca = phi <6 x double> [ zeroinitializer, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x double> %promotealloca, double %tmp71, i32 %tmp10
+; OPT: [[TMP:%tmp7.*]] = load double, ptr addrspace(1) %tmp5, align 8
+; OPT: %0 = insertelement <6 x double> %promotealloca, double [[TMP]], i32 %tmp10
 ; OPT: .preheader:
 ; OPT: %bc = bitcast <6 x double> %0 to <6 x i64>
 ; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20
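
Reviewer note (not part of the patch): the core of this change replaces the old two-pass scheme, where a `promotealloca.dummyload` was queued whenever a block's vector value was unknown, with single-pass placeholders: a `freeze poison` instruction stands in for the unknown value and is resolved through `SSAUpdater` afterwards. Below is a minimal standalone sketch of that pattern; the helper names (`getOrCreatePlaceholder`, `fixupPlaceholders`) are illustrative and do not appear in the patch.

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"

using namespace llvm;

// Return the vector value live at \p I, creating a placeholder if the
// SSAUpdater does not yet know a value for I's block.
static Value *
getOrCreatePlaceholder(Instruction *I, FixedVectorType *VecTy,
                       SSAUpdater &Updater,
                       SmallVectorImpl<Instruction *> &Placeholders) {
  BasicBlock *BB = I->getParent();
  if (Value *Known = Updater.FindValueForBlock(BB))
    return Known;

  // `freeze poison` makes a convenient placeholder: it is a real Instruction
  // (so it can be RAUW'd and erased later) and it touches no memory.
  IRBuilder<> Builder(I);
  auto *Placeholder = cast<Instruction>(
      Builder.CreateFreeze(PoisonValue::get(VecTy), "placeholder"));
  Placeholders.push_back(Placeholder);
  Updater.AddAvailableValue(BB, Placeholder);
  return Placeholder;
}

// Once every user has been rewritten, replace each placeholder with the real
// SSA value; SSAUpdater inserts PHIs at control-flow merges as needed.
static void fixupPlaceholders(SSAUpdater &Updater,
                              ArrayRef<Instruction *> Placeholders) {
  for (Instruction *P : Placeholders) {
    P->replaceAllUsesWith(Updater.GetValueInMiddleOfBlock(P->getParent()));
    P->eraseFromParent();
  }
}
```

Compared with the old dummy load, `freeze poison` avoids fabricating a memory operation just to obtain an erasable instruction, and it removes the need for a second lowering pass over deferred loads (and the accompanying `NewDLs.empty()` assertion): every placeholder is resolved by a single `GetValueInMiddleOfBlock` plus RAUW sweep.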