diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index bb95265a794a0..f431535c722ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -85,6 +85,42 @@ static cl::opt
              "when sorting profitable allocas"),
     cl::init(4));
 
+// We support vector indices of the form (A * stride) + B
+// All parts are optional.
+struct GEPToVectorIndex {
+  Value *VarIndex = nullptr;         // defaults to 0
+  ConstantInt *VarMul = nullptr;     // defaults to 1
+  ConstantInt *ConstIndex = nullptr; // defaults to 0
+  Value *Full = nullptr;
+};
+
+struct MemTransferInfo {
+  ConstantInt *SrcIndex = nullptr;
+  ConstantInt *DestIndex = nullptr;
+};
+
+// Analysis for planning the different strategies of alloca promotion.
+struct AllocaAnalysis {
+  AllocaInst *Alloca = nullptr;
+  SmallVector Pointers;
+  SmallVector Uses;
+  unsigned Score = 0;
+  bool HaveSelectOrPHI = false;
+  struct {
+    FixedVectorType *Ty = nullptr;
+    SmallVector Worklist;
+    SmallVector UsersToRemove;
+    MapVector GEPVectorIdx;
+    MapVector TransferInfo;
+  } Vector;
+  struct {
+    bool Enable = false;
+    SmallVector Worklist;
+  } LDS;
+
+  explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
+};
+
 // Shared implementation which can do both promotion to vector and to LDS.
 class AMDGPUPromoteAllocaImpl {
 private:
@@ -106,10 +142,7 @@ class AMDGPUPromoteAllocaImpl {
   std::pair getLocalSizeYZ(IRBuilder<> &Builder);
   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
 
-  /// BaseAlloca is the alloca root the search started from.
-  /// Val may be that alloca or a recursive user of it.
-  bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
-                               std::vector &WorkList) const;
+  bool collectAllocaUses(AllocaAnalysis &AA) const;
 
   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
@@ -122,10 +155,13 @@ class AMDGPUPromoteAllocaImpl {
   /// Check whether we have enough local memory for promotion.
   bool hasSufficientLocalMem(const Function &F);
 
-  bool tryPromoteAllocaToVector(AllocaInst &I);
-  bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
+  FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
+  void analyzePromoteToVector(AllocaAnalysis &AA) const;
+  void promoteAllocaToVector(AllocaAnalysis &AA);
+  void analyzePromoteToLDS(AllocaAnalysis &AA) const;
+  bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS);
 
-  void sortAllocasToPromote(SmallVectorImpl &Allocas);
+  void scoreAlloca(AllocaAnalysis &AA) const;
 
   void setFunctionLimits(const Function &F);
 
@@ -236,53 +272,77 @@ FunctionPass *llvm::createAMDGPUPromoteAlloca() {
   return new AMDGPUPromoteAlloca();
 }
 
-static void collectAllocaUses(AllocaInst &Alloca,
-                              SmallVectorImpl &Uses) {
-  SmallVector WorkList({&Alloca});
+bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
+  const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
+    LLVM_DEBUG(dbgs() << "  Cannot promote alloca: " << Msg << "\n"
+                      << "    " << *Inst << "\n");
+    return false;
+  };
+
+  SmallVector WorkList({AA.Alloca});
   while (!WorkList.empty()) {
     auto *Cur = WorkList.pop_back_val();
+    if (find(AA.Pointers, Cur) != AA.Pointers.end())
+      continue;
+    AA.Pointers.push_back(Cur);
     for (auto &U : Cur->uses()) {
-      Uses.push_back(&U);
+      auto *Inst = cast(U.getUser());
+      if (isa(Inst)) {
+        if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
+          return RejectUser(Inst, "pointer escapes via store");
+        }
+      }
+      AA.Uses.push_back(&U);
 
-      if (isa(U.getUser()))
-        WorkList.push_back(cast(U.getUser()));
+      if (isa(U.getUser())) {
+        WorkList.push_back(Inst);
+      } else if (auto *SI = dyn_cast(Inst)) {
+        // Only promote a select if we know that the other select operand is
+        // from another pointer that will also be promoted.
+        if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2))
+          return RejectUser(Inst, "select from mixed objects");
+        WorkList.push_back(Inst);
+        AA.HaveSelectOrPHI = true;
+      } else if (auto *Phi = dyn_cast(Inst)) {
+        // Repeat for phis.
+
+        // TODO: Handle more complex cases. We should be able to replace loops
+        // over arrays.
+        switch (Phi->getNumIncomingValues()) {
+        case 1:
+          break;
+        case 2:
+          if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1))
+            return RejectUser(Inst, "phi from mixed objects");
+          break;
+        default:
+          return RejectUser(Inst, "phi with too many operands");
+        }
+
+        WorkList.push_back(Inst);
+        AA.HaveSelectOrPHI = true;
+      }
     }
   }
+  return true;
 }
 
-void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
-    SmallVectorImpl &Allocas) {
-  DenseMap Scores;
-
-  for (auto *Alloca : Allocas) {
-    LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
-    unsigned &Score = Scores[Alloca];
-    // Increment score by one for each user + a bonus for users within loops.
-    SmallVector Uses;
-    collectAllocaUses(*Alloca, Uses);
-    for (auto *U : Uses) {
-      Instruction *Inst = cast(U->getUser());
-      if (isa(Inst))
-        continue;
-      unsigned UserScore =
-          1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
-      LLVM_DEBUG(dbgs() << "  [+" << UserScore << "]:\t" << *Inst << "\n");
-      Score += UserScore;
-    }
-    LLVM_DEBUG(dbgs() << "  => Final Score:" << Score << "\n");
+void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
+  LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
+  unsigned Score = 0;
+  // Increment score by one for each user + a bonus for users within loops.
+ for (auto *U : AA.Uses) { + Instruction *Inst = cast(U->getUser()); + if (isa(Inst) || isa(Inst) || + isa(Inst)) + continue; + unsigned UserScore = + 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent())); + LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n"); + Score += UserScore; } - - stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) { - return Scores.at(A) > Scores.at(B); - }); - - // clang-format off - LLVM_DEBUG( - dbgs() << "Sorted Worklist:\n"; - for (auto *A: Allocas) - dbgs() << " " << *A << "\n"; - ); - // clang-format on + LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n"); + AA.Score = Score; } void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) { @@ -319,27 +379,48 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { : (MaxVGPRs * 32)) / VGPRBudgetRatio; - SmallVector Allocas; + std::vector Allocas; for (Instruction &I : F.getEntryBlock()) { if (AllocaInst *AI = dyn_cast(&I)) { // Array allocations are probably not worth handling, since an allocation // of the array type is the canonical form. if (!AI->isStaticAlloca() || AI->isArrayAllocation()) continue; - Allocas.push_back(AI); + + LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n'); + + AllocaAnalysis AA{AI}; + if (collectAllocaUses(AA)) { + analyzePromoteToVector(AA); + if (PromoteToLDS) + analyzePromoteToLDS(AA); + if (AA.Vector.Ty || AA.LDS.Enable) { + scoreAlloca(AA); + Allocas.push_back(std::move(AA)); + } + } } } - sortAllocasToPromote(Allocas); + stable_sort(Allocas, + [](const auto &A, const auto &B) { return A.Score > B.Score; }); + + // clang-format off + LLVM_DEBUG( + dbgs() << "Sorted Worklist:\n"; + for (const auto &AA : Allocas) + dbgs() << " " << *AA.Alloca << "\n"; + ); + // clang-format on bool Changed = false; - for (AllocaInst *AI : Allocas) { - const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType()); - // First, check if we have enough budget to vectorize this alloca. - if (AllocaCost <= VectorizationBudget) { - // If we do, attempt vectorization, otherwise, fall through and try - // promoting to LDS instead. - if (tryPromoteAllocaToVector(*AI)) { + for (AllocaAnalysis &AA : Allocas) { + if (AA.Vector.Ty) { + const unsigned AllocaCost = + DL->getTypeSizeInBits(AA.Alloca->getAllocatedType()); + // First, check if we have enough budget to vectorize this alloca. + if (AllocaCost <= VectorizationBudget) { + promoteAllocaToVector(AA); Changed = true; assert((VectorizationBudget - AllocaCost) < VectorizationBudget && "Underflow!"); @@ -347,14 +428,14 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { LLVM_DEBUG(dbgs() << " Remaining vectorization budget:" << VectorizationBudget << "\n"); continue; + } else { + LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" + << AllocaCost << ", budget:" << VectorizationBudget + << "): " << *AA.Alloca << "\n"); } - } else { - LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" - << AllocaCost << ", budget:" << VectorizationBudget - << "): " << *AI << "\n"); } - if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS)) + if (AA.LDS.Enable && tryPromoteAllocaToLDS(AA, SufficientLDS)) Changed = true; } @@ -365,11 +446,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { return Changed; } -struct MemTransferInfo { - ConstantInt *SrcIndex = nullptr; - ConstantInt *DestIndex = nullptr; -}; - // Checks if the instruction I is a memset user of the alloca AI that we can // deal with. 
Currently, only non-volatile memsets that affect the whole alloca // are handled. @@ -387,23 +463,49 @@ static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI, match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile(); } -static Value *calculateVectorIndex( - Value *Ptr, const std::map &GEPIdx) { - auto *GEP = dyn_cast(Ptr->stripPointerCasts()); - if (!GEP) - return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext())); +static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) { + IRBuilder<> B(Ptr->getContext()); + + Ptr = Ptr->stripPointerCasts(); + if (Ptr == AA.Alloca) + return B.getInt32(0); + + auto *GEP = cast(Ptr); + auto I = AA.Vector.GEPVectorIdx.find(GEP); + assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!"); - auto I = GEPIdx.find(GEP); - assert(I != GEPIdx.end() && "Must have entry for GEP!"); + if (!I->second.Full) { + Value *Result = nullptr; + B.SetInsertPoint(GEP); + + if (I->second.VarIndex) { + Result = I->second.VarIndex; + Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty()); + + if (I->second.VarMul) + Result = B.CreateMul(Result, I->second.VarMul); + } - Value *IndexValue = I->second; - assert(IndexValue && "index value missing from GEP index map"); - return IndexValue; + if (I->second.ConstIndex) { + if (Result) { + Result = B.CreateAdd(Result, I->second.ConstIndex); + } else { + Result = I->second.ConstIndex; + } + } + + if (!Result) + Result = B.getInt32(0); + + I->second.Full = Result; + } + + return I->second.Full; } -static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, - Type *VecElemTy, const DataLayout &DL, - SmallVector &NewInsts) { +static std::optional +computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, + Type *VecElemTy, const DataLayout &DL) { // TODO: Extracting a "multiple of X" from a GEP might be a useful generic // helper. LLVMContext &Ctx = GEP->getContext(); @@ -431,7 +533,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, Value *CurPtr = GEP; while (auto *CurGEP = dyn_cast(CurPtr)) { if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset)) - return nullptr; + return {}; // Move to the next outer pointer. 
CurPtr = CurGEP->getPointerOperand(); @@ -441,87 +543,64 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy); if (VarOffsets.size() > 1) - return nullptr; + return {}; APInt IndexQuot; int64_t Rem; APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem); if (Rem != 0) - return nullptr; - if (VarOffsets.size() == 0) - return ConstantInt::get(Ctx, IndexQuot); + return {}; + + GEPToVectorIndex Result; - IRBuilder<> Builder(GEP); + if (!ConstOffset.isZero()) + Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW)); + + if (VarOffsets.empty()) + return Result; const auto &VarOffset = VarOffsets.front(); APInt OffsetQuot; APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); if (Rem != 0 || OffsetQuot.isZero()) - return nullptr; + return {}; - Value *Offset = VarOffset.first; - auto *OffsetType = dyn_cast(Offset->getType()); + Result.VarIndex = VarOffset.first; + auto *OffsetType = dyn_cast(Result.VarIndex->getType()); if (!OffsetType) - return nullptr; + return {}; if (!OffsetQuot.isOne()) { - ConstantInt *ConstMul = - ConstantInt::get(Ctx, OffsetQuot.sext(OffsetType->getBitWidth())); - Offset = Builder.CreateMul(Offset, ConstMul); - if (Instruction *NewInst = dyn_cast(Offset)) - NewInsts.push_back(NewInst); + Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW)); } - if (ConstOffset.isZero()) - return Offset; - - ConstantInt *ConstIndex = - ConstantInt::get(Ctx, IndexQuot.sext(OffsetType->getBitWidth())); - Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex); - if (Instruction *NewInst = dyn_cast(IndexAdd)) - NewInsts.push_back(NewInst); - return IndexAdd; + + return Result; } /// Promotes a single user of the alloca to a vector form. /// /// \param Inst Instruction to be promoted. /// \param DL Module Data Layout. -/// \param VectorTy Vectorized Type. +/// \param AA Alloca Analysis. /// \param VecStoreSize Size of \p VectorTy in bytes. /// \param ElementSize Size of \p VectorTy element type in bytes. -/// \param TransferInfo MemTransferInst info map. -/// \param GEPVectorIdx GEP -> VectorIdx cache. /// \param CurVal Current value of the vector (e.g. last stored value) /// \param[out] DeferredLoads \p Inst is added to this vector if it can't /// be promoted now. This happens when promoting requires \p /// CurVal, but \p CurVal is nullptr. /// \return the stored value if \p Inst would have written to the alloca, or /// nullptr otherwise. -static Value *promoteAllocaUserToVector( - Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy, - unsigned VecStoreSize, unsigned ElementSize, - DenseMap &TransferInfo, - std::map &GEPVectorIdx, Value *CurVal, - SmallVectorImpl &DeferredLoads) { +static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL, + AllocaAnalysis &AA, + unsigned VecStoreSize, + unsigned ElementSize, + function_ref GetCurVal) { // Note: we use InstSimplifyFolder because it can leverage the DataLayout // to do more folding, especially in the case of vector splats. IRBuilder Builder(Inst->getContext(), InstSimplifyFolder(DL)); Builder.SetInsertPoint(Inst); - const auto GetOrLoadCurrentVectorValue = [&]() -> Value * { - if (CurVal) - return CurVal; - - // If the current value is not known, insert a dummy load and lower it on - // the second pass. 
- LoadInst *Dummy = - Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()), - "promotealloca.dummyload"); - DeferredLoads.push_back(Dummy); - return Dummy; - }; - const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val, Type *PtrTy) -> Value * { assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy)); @@ -537,18 +616,13 @@ static Value *promoteAllocaUserToVector( Val, FixedVectorType::get(EltTy, NumPtrElts)); }; - Type *VecEltTy = VectorTy->getElementType(); + Type *VecEltTy = AA.Vector.Ty->getElementType(); switch (Inst->getOpcode()) { case Instruction::Load: { - // Loads can only be lowered if the value is known. - if (!CurVal) { - DeferredLoads.push_back(cast(Inst)); - return nullptr; - } - - Value *Index = calculateVectorIndex( - cast(Inst)->getPointerOperand(), GEPVectorIdx); + Value *CurVal = GetCurVal(); + Value *Index = + calculateVectorIndex(cast(Inst)->getPointerOperand(), AA); // We're loading the full vector. Type *AccessTy = Inst->getType(); @@ -604,7 +678,7 @@ static Value *promoteAllocaUserToVector( // to know the current value. If this is a store of a single element, we // need to know the value. StoreInst *SI = cast(Inst); - Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx); + Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA); Value *Val = SI->getValueOperand(); // We're storing the full vector, we can handle this without knowing CurVal. @@ -614,9 +688,9 @@ static Value *promoteAllocaUserToVector( if (CI->isZeroValue() && AccessSize == VecStoreSize) { if (AccessTy->isPtrOrPtrVectorTy()) Val = CreateTempPtrIntCast(Val, AccessTy); - else if (VectorTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, VectorTy); - return Builder.CreateBitOrPointerCast(Val, VectorTy); + else if (AA.Vector.Ty->isPtrOrPtrVectorTy()) + Val = CreateTempPtrIntCast(Val, AA.Vector.Ty); + return Builder.CreateBitOrPointerCast(Val, AA.Vector.Ty); } } @@ -625,7 +699,7 @@ static Value *promoteAllocaUserToVector( assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); const unsigned NumWrittenElts = AccessSize / DL.getTypeStoreSize(VecEltTy); - const unsigned NumVecElts = VectorTy->getNumElements(); + const unsigned NumVecElts = AA.Vector.Ty->getNumElements(); auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); @@ -636,7 +710,7 @@ static Value *promoteAllocaUserToVector( Val = Builder.CreateBitOrPointerCast(Val, SubVecTy); - Value *CurVec = GetOrLoadCurrentVectorValue(); + Value *CurVec = GetCurVal(); for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts); K < NumElts; ++K) { Value *CurIdx = @@ -649,22 +723,21 @@ static Value *promoteAllocaUserToVector( if (Val->getType() != VecEltTy) Val = Builder.CreateBitOrPointerCast(Val, VecEltTy); - return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val, - Index); + return Builder.CreateInsertElement(GetCurVal(), Val, Index); } case Instruction::Call: { if (auto *MTI = dyn_cast(Inst)) { // For memcpy, we need to know curval. 
ConstantInt *Length = cast(MTI->getLength()); unsigned NumCopied = Length->getZExtValue() / ElementSize; - MemTransferInfo *TI = &TransferInfo[MTI]; + MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI]; unsigned SrcBegin = TI->SrcIndex->getZExtValue(); unsigned DestBegin = TI->DestIndex->getZExtValue(); SmallVector Mask; - for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { + for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) { if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { - Mask.push_back(SrcBegin < VectorTy->getNumElements() + Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements() ? SrcBegin++ : PoisonMaskElem); } else { @@ -672,7 +745,7 @@ static Value *promoteAllocaUserToVector( } } - return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask); + return Builder.CreateShuffleVector(GetCurVal(), Mask); } if (auto *MSI = dyn_cast(Inst)) { @@ -693,14 +766,14 @@ static Value *promoteAllocaUserToVector( Elt = Builder.CreateBitCast(EltBytes, VecEltTy); } - return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt); + return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt); } if (auto *Intr = dyn_cast(Inst)) { if (Intr->getIntrinsicID() == Intrinsic::objectsize) { Intr->replaceAllUsesWith( Builder.getIntN(Intr->getType()->getIntegerBitWidth(), - DL.getTypeAllocSize(VectorTy))); + DL.getTypeAllocSize(AA.Vector.Ty))); return nullptr; } } @@ -791,16 +864,13 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB, return I; } -// FIXME: Should try to pick the most likely to be profitable allocas first. -bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { - LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n'); - +FixedVectorType * +AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const { if (DisablePromoteAllocaToVector) { - LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n"); - return false; + LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n"); + return nullptr; } - Type *AllocaTy = Alloca.getAllocatedType(); auto *VectorTy = dyn_cast(AllocaTy); if (auto *ArrayTy = dyn_cast(AllocaTy)) { uint64_t NumElems = 1; @@ -832,10 +902,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { } } } - if (!VectorTy) { LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); - return false; + return nullptr; } const unsigned MaxElements = @@ -845,46 +914,46 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { VectorTy->getNumElements() < 2) { LLVM_DEBUG(dbgs() << " " << *VectorTy << " has an unsupported number of elements\n"); - return false; + return nullptr; } - std::map GEPVectorIdx; - SmallVector WorkList; - SmallVector UsersToRemove; - SmallVector DeferredInsts; - SmallVector NewGEPInsts; - DenseMap TransferInfo; - - const auto RejectUser = [&](Instruction *Inst, Twine Msg) { - LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n" - << " " << *Inst << "\n"); - for (auto *Inst : reverse(NewGEPInsts)) - Inst->eraseFromParent(); - return false; - }; - - SmallVector Uses; - collectAllocaUses(Alloca, Uses); - - LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n"); - Type *VecEltTy = VectorTy->getElementType(); unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy); if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) { LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " "does not match the type's size\n"); - 
return false; + return nullptr; + } + + return VectorTy; +} + +void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const { + if (AA.HaveSelectOrPHI) { + LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n"); + return; } - unsigned ElementSize = ElementSizeInBits / 8; + + Type *AllocaTy = AA.Alloca->getAllocatedType(); + AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy); + if (!AA.Vector.Ty) + return; + + const auto RejectUser = [&](Instruction *Inst, Twine Msg) { + LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n" + << " " << *Inst << "\n"); + AA.Vector.Ty = nullptr; + }; + + Type *VecEltTy = AA.Vector.Ty->getElementType(); + unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8; assert(ElementSize > 0); - for (auto *U : Uses) { + for (auto *U : AA.Uses) { Instruction *Inst = cast(U->getUser()); if (Value *Ptr = getLoadStorePointerOperand(Inst)) { - // This is a store of the pointer, not to the pointer. - if (isa(Inst) && - U->getOperandNo() != StoreInst::getPointerOperandIndex()) - return RejectUser(Inst, "pointer is being stored"); + assert(!isa(Inst) || + U->getOperandNo() == StoreInst::getPointerOperandIndex()); Type *AccessTy = getLoadStoreType(Inst); if (AccessTy->isAggregateType()) @@ -900,34 +969,35 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { Ptr = Ptr->stripPointerCasts(); // Alloca already accessed as vector. - if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) == - DL->getTypeStoreSize(AccessTy)) { - WorkList.push_back(Inst); + if (Ptr == AA.Alloca && + DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) == + DL->getTypeStoreSize(AccessTy)) { + AA.Vector.Worklist.push_back(Inst); continue; } - if (!isSupportedAccessType(VectorTy, AccessTy, *DL)) + if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL)) return RejectUser(Inst, "not a supported access type"); - WorkList.push_back(Inst); + AA.Vector.Worklist.push_back(Inst); continue; } if (auto *GEP = dyn_cast(Inst)) { // If we can't compute a vector index from this GEP, then we can't // promote this alloca to vector. 
- Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts); + auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL); if (!Index) return RejectUser(Inst, "cannot compute vector index for GEP"); - GEPVectorIdx[GEP] = Index; - UsersToRemove.push_back(Inst); + AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value()); + AA.Vector.UsersToRemove.push_back(Inst); continue; } if (MemSetInst *MSI = dyn_cast(Inst); - MSI && isSupportedMemset(MSI, &Alloca, *DL)) { - WorkList.push_back(Inst); + MSI && isSupportedMemset(MSI, AA.Alloca, *DL)) { + AA.Vector.Worklist.push_back(Inst); continue; } @@ -940,21 +1010,22 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { return RejectUser(Inst, "mem transfer inst length is non-constant or " "not a multiple of the vector element size"); - if (TransferInfo.try_emplace(TransferInst).second) { - DeferredInsts.push_back(Inst); - WorkList.push_back(Inst); - } - auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * { - GetElementPtrInst *GEP = dyn_cast(Ptr); - if (Ptr != &Alloca && !GEPVectorIdx.count(GEP)) - return nullptr; + if (Ptr == AA.Alloca) + return ConstantInt::get(Ptr->getContext(), APInt(32, 0)); - return dyn_cast(calculateVectorIndex(Ptr, GEPVectorIdx)); + GetElementPtrInst *GEP = cast(Ptr); + const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second; + if (GEPI.VarIndex) + return nullptr; + if (GEPI.ConstIndex) + return GEPI.ConstIndex; + return ConstantInt::get(Ptr->getContext(), APInt(32, 0)); }; + MemTransferInfo *TI = + &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second; unsigned OpNum = U->getOperandNo(); - MemTransferInfo *TI = &TransferInfo[TransferInst]; if (OpNum == 0) { Value *Dest = TransferInst->getDest(); ConstantInt *Index = getPointerIndexOfAlloca(Dest); @@ -974,7 +1045,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (auto *Intr = dyn_cast(Inst)) { if (Intr->getIntrinsicID() == Intrinsic::objectsize) { - WorkList.push_back(Inst); + AA.Vector.Worklist.push_back(Inst); continue; } } @@ -983,97 +1054,106 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (isAssumeLikeIntrinsic(Inst)) { if (!Inst->use_empty()) return RejectUser(Inst, "assume-like intrinsic cannot have any users"); - UsersToRemove.push_back(Inst); + AA.Vector.UsersToRemove.push_back(Inst); continue; } if (isa(Inst) && all_of(Inst->users(), [](User *U) { return isAssumeLikeIntrinsic(cast(U)); })) { - UsersToRemove.push_back(Inst); + AA.Vector.UsersToRemove.push_back(Inst); continue; } return RejectUser(Inst, "unhandled alloca user"); } - while (!DeferredInsts.empty()) { - Instruction *Inst = DeferredInsts.pop_back_val(); - MemTransferInst *TransferInst = cast(Inst); - // TODO: Support the case if the pointers are from different alloca or - // from different address spaces. - MemTransferInfo &Info = TransferInfo[TransferInst]; - if (!Info.SrcIndex || !Info.DestIndex) - return RejectUser( - Inst, "mem transfer inst is missing constant src and/or dst index"); + // Follow-up check to ensure we've seen both sides of all transfer insts. 
+ for (const auto &Entry : AA.Vector.TransferInfo) { + const MemTransferInfo &TI = Entry.second; + if (!TI.SrcIndex || !TI.DestIndex) + return RejectUser(Entry.first, + "mem transfer inst between different objects"); + AA.Vector.Worklist.push_back(Entry.first); } +} + +void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) { + LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n'); + LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType() + << " -> " << *AA.Vector.Ty << '\n'); + const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty); - LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " - << *VectorTy << '\n'); - const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy); + Type *VecEltTy = AA.Vector.Ty->getElementType(); + const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8; // Alloca is uninitialized memory. Imitate that by making the first value // undef. SSAUpdater Updater; - Updater.Initialize(VectorTy, "promotealloca"); + Updater.Initialize(AA.Vector.Ty, "promotealloca"); - BasicBlock *EntryBB = Alloca.getParent(); + BasicBlock *EntryBB = AA.Alloca->getParent(); BasicBlock::iterator InitInsertPos = - skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator()); - // Alloca memory is undefined to begin, not poison. - Value *AllocaInitValue = - new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos); - AllocaInitValue->takeName(&Alloca); + skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator()); + IRBuilder<> Builder(&*InitInsertPos); + Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty)); + AllocaInitValue->takeName(AA.Alloca); - Updater.AddAvailableValue(EntryBB, AllocaInitValue); + Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue); - // First handle the initial worklist. - SmallVector DeferredLoads; - forEachWorkListItem(WorkList, [&](Instruction *I) { + // First handle the initial worklist, in basic block order. + // + // Insert a placeholder whenever we need the vector value at the top of a + // basic block. + SmallVector Placeholders; + forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) { BasicBlock *BB = I->getParent(); - // On the first pass, we only take values that are trivially known, i.e. - // where AddAvailableValue was already called in this block. - Value *Result = promoteAllocaUserToVector( - I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, - Updater.FindValueForBlock(BB), DeferredLoads); + auto GetCurVal = [&]() -> Value * { + if (Value *CurVal = Updater.FindValueForBlock(BB)) + return CurVal; + + // If the current value in the basic block is not yet known, insert a + // placeholder that we will replace later. + IRBuilder<> Builder(I); + auto *Placeholder = cast(Builder.CreateFreeze( + PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder")); + Placeholders.push_back(Placeholder); + Updater.AddAvailableValue(BB, Placeholder); + return Placeholder; + }; + + Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize, + ElementSize, GetCurVal); if (Result) Updater.AddAvailableValue(BB, Result); }); - // Then handle deferred loads. - forEachWorkListItem(DeferredLoads, [&](Instruction *I) { - SmallVector NewDLs; - BasicBlock *BB = I->getParent(); - // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always - // get a value, inserting PHIs as needed. 
- Value *Result = promoteAllocaUserToVector( - I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, - Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs); - if (Result) - Updater.AddAvailableValue(BB, Result); - assert(NewDLs.empty() && "No more deferred loads should be queued!"); - }); + // Now fixup the placeholders. + for (Instruction *Placeholder : Placeholders) { + Placeholder->replaceAllUsesWith( + Updater.GetValueInMiddleOfBlock(Placeholder->getParent())); + Placeholder->eraseFromParent(); + } // Delete all instructions. On the first pass, new dummy loads may have been // added so we need to collect them too. - DenseSet InstsToDelete(WorkList.begin(), WorkList.end()); - InstsToDelete.insert_range(DeferredLoads); + DenseSet InstsToDelete(AA.Vector.Worklist.begin(), + AA.Vector.Worklist.end()); for (Instruction *I : InstsToDelete) { assert(I->use_empty()); I->eraseFromParent(); } // Delete all the users that are known to be removeable. - for (Instruction *I : reverse(UsersToRemove)) { + for (Instruction *I : reverse(AA.Vector.UsersToRemove)) { I->dropDroppableUses(); assert(I->use_empty()); I->eraseFromParent(); } // Alloca should now be dead too. - assert(Alloca.use_empty()); - Alloca.eraseFromParent(); - return true; + assert(AA.Alloca->use_empty()); + AA.Alloca->eraseFromParent(); } std::pair @@ -1247,61 +1327,78 @@ bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca( return true; } -bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( - Value *BaseAlloca, Value *Val, std::vector &WorkList) const { +void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const { + if (DisablePromoteAllocaToLDS) { + LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n"); + return; + } - for (User *User : Val->users()) { - if (is_contained(WorkList, User)) - continue; + // Don't promote the alloca to LDS for shader calling conventions as the work + // item ID intrinsics are not supported for these calling conventions. + // Furthermore not all LDS is available for some of the stages. + const Function &ContainingFunction = *AA.Alloca->getFunction(); + CallingConv::ID CC = ContainingFunction.getCallingConv(); + + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + break; + default: + LLVM_DEBUG( + dbgs() + << " promote alloca to LDS not supported with calling convention.\n"); + return; + } + + for (Use *Use : AA.Uses) { + auto *User = Use->getUser(); if (CallInst *CI = dyn_cast(User)) { if (!isCallPromotable(CI)) - return false; + return; - WorkList.push_back(User); + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(User); continue; } Instruction *UseInst = cast(User); if (UseInst->getOpcode() == Instruction::PtrToInt) - return false; + return; if (LoadInst *LI = dyn_cast(UseInst)) { if (LI->isVolatile()) - return false; + return; continue; } if (StoreInst *SI = dyn_cast(UseInst)) { if (SI->isVolatile()) - return false; - - // Reject if the stored value is not the pointer operand. - if (SI->getPointerOperand() != Val) - return false; + return; continue; } if (AtomicRMWInst *RMW = dyn_cast(UseInst)) { if (RMW->isVolatile()) - return false; + return; continue; } if (AtomicCmpXchgInst *CAS = dyn_cast(UseInst)) { if (CAS->isVolatile()) - return false; + return; continue; } // Only promote a select if we know that the other select operand // is from another pointer that will also be promoted. 
if (ICmpInst *ICmp = dyn_cast(UseInst)) { - if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1)) - return false; + if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1)) + return; // May need to rewrite constant operands. - WorkList.push_back(ICmp); + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(ICmp); continue; } @@ -1309,28 +1406,8 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( // Be conservative if an address could be computed outside the bounds of // the alloca. if (!GEP->isInBounds()) - return false; - } else if (SelectInst *SI = dyn_cast(UseInst)) { - // Only promote a select if we know that the other select operand is from - // another pointer that will also be promoted. - if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2)) - return false; - } else if (PHINode *Phi = dyn_cast(UseInst)) { - // Repeat for phis. - - // TODO: Handle more complex cases. We should be able to replace loops - // over arrays. - switch (Phi->getNumIncomingValues()) { - case 1: - break; - case 2: - if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1)) - return false; - break; - default: - return false; - } - } else if (!isa(User)) { + return; + } else if (!isa(User)) { // Do not promote vector/aggregate type instructions. It is hard to track // their users. @@ -1338,15 +1415,14 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( // // TODO: If we know the address is only observed through flat pointers, we // could still promote. - return false; + return; } - WorkList.push_back(User); - if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList)) - return false; + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(User); } - return true; + AA.LDS.Enable = true; } bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { @@ -1477,44 +1553,23 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { } // FIXME: Should try to pick the most likely to be profitable allocas first. -bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, +bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS) { - LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n'); - - if (DisablePromoteAllocaToLDS) { - LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n"); - return false; - } - - const DataLayout &DL = Mod->getDataLayout(); - IRBuilder<> Builder(&I); - - const Function &ContainingFunction = *I.getFunction(); - CallingConv::ID CC = ContainingFunction.getCallingConv(); - - // Don't promote the alloca to LDS for shader calling conventions as the work - // item ID intrinsics are not supported for these calling conventions. - // Furthermore not all LDS is available for some of the stages. - switch (CC) { - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - break; - default: - LLVM_DEBUG( - dbgs() - << " promote alloca to LDS not supported with calling convention.\n"); - return false; - } + LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n'); // Not likely to have sufficient local memory for promotion. 
if (!SufficientLDS) return false; + const DataLayout &DL = Mod->getDataLayout(); + IRBuilder<> Builder(AA.Alloca); + + const Function &ContainingFunction = *AA.Alloca->getParent()->getParent(); const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; - Align Alignment = - DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType()); + Align Alignment = DL.getValueOrABITypeAlignment( + AA.Alloca->getAlign(), AA.Alloca->getAllocatedType()); // FIXME: This computed padding is likely wrong since it depends on inverse // usage order. @@ -1524,7 +1579,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment); uint32_t AllocSize = - WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType()); + WorkGroupSize * DL.getTypeAllocSize(AA.Alloca->getAllocatedType()); NewSize += AllocSize; if (NewSize > LocalMemLimit) { @@ -1535,24 +1590,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, CurrentLocalMemUsage = NewSize; - std::vector WorkList; - - if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { - LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n"); - return false; - } - LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n"); - Function *F = I.getFunction(); + Function *F = AA.Alloca->getFunction(); - Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize); + Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize); GlobalVariable *GV = new GlobalVariable( *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy), - Twine(F->getName()) + Twine('.') + I.getName(), nullptr, + Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(I.getAlign()); + GV->setAlignment(AA.Alloca->getAlign()); Value *TCntY, *TCntZ; @@ -1571,15 +1619,15 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID}; Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices); - I.mutateType(Offset->getType()); - I.replaceAllUsesWith(Offset); - I.eraseFromParent(); + AA.Alloca->mutateType(Offset->getType()); + AA.Alloca->replaceAllUsesWith(Offset); + AA.Alloca->eraseFromParent(); SmallVector DeferredIntrs; PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS); - for (Value *V : WorkList) { + for (Value *V : AA.LDS.Worklist) { CallInst *Call = dyn_cast(V); if (!Call) { if (ICmpInst *CI = dyn_cast(V)) { diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll index 63622e67e7d0b..7b64d8728cc24 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll @@ -262,14 +262,15 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset(ptr %out) { ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <6 x i64> [[TMP12]], i64 3, i32 3 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <6 x i64> [[TMP13]], i64 4, i32 4 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 5, i32 5 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[SEL3]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[SEL3]] to i32 +; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP7]], 3 +; 
CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP17]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP1]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP16]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP18]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2 ; CHECK-NEXT: [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2 ; CHECK-NEXT: store i64 [[ELEM]], ptr [[OUT]], align 8 @@ -311,15 +312,16 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out) ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <6 x i64> [[TMP13]], i64 3, i32 3 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 4, i32 4 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <6 x i64> [[TMP15]], i64 5, i32 5 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[SEL3]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 6 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[SEL3]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP17]], 3 +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP8]], 6 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP18]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x i64> poison, i64 [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP19]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x i64> [[TMP4]], i64 [[TMP6]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP2]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP18]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP20]] ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <3 x i64> [[TMP7]], i64 [[TMP9]], i64 2 ; CHECK-NEXT: [[ELEM:%.*]] = extractelement <3 x i64> [[TMP10]], i32 2 ; CHECK-NEXT: store i64 [[ELEM]], ptr [[OUT]], align 8 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll index a865bf5058d6a..7ebb4ca262614 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll @@ -11,8 +11,9 @@ define amdgpu_kernel void @negative_index_byte(ptr %out, i64 %offset) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 1, i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 2, i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 3, i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET:%.*]], -1 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[OFFSET:%.*]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP5]], -1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 
[[TMP7]] ; CHECK-NEXT: store i8 [[TMP6]], ptr [[OUT:%.*]], align 1 ; CHECK-NEXT: ret void ; @@ -39,8 +40,9 @@ define amdgpu_kernel void @negative_index_word(ptr %out, i64 %offset) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 1, i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 2, i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 3, i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET:%.*]], -1 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[OFFSET:%.*]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP5]], -1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 [[TMP7]] ; CHECK-NEXT: store i32 [[TMP6]], ptr [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll index ab03177d1edc5..ae6157af2cf4c 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll @@ -1,14 +1,16 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s ; REQUIRES: asserts -; CHECK: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5) -; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %simpleuser, align 4 +; CHECK-LABEL: Analyzing: %simpleuser = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: [+1]: store i64 42, ptr addrspace(5) %simpleuser, align 8 ; CHECK-NEXT: => Final Score:1 +; CHECK-LABEL: Analyzing: %manyusers = alloca [4 x i64], align 4, addrspace(5) ; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5) -; CHECK-NEXT: [+1]: store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4 -; CHECK-NEXT: [+1]: %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1 -; CHECK-NEXT: [+1]: store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4 -; CHECK-NEXT: [+1]: %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1 +; CHECK-NEXT: [+1]: store i64 %v0.add, ptr addrspace(5) %manyusers.1, align 8 +; CHECK-NEXT: [+1]: %v0 = load i64, ptr addrspace(5) %manyusers.1, align 8 +; CHECK-NEXT: [+1]: store i64 %v1.add, ptr addrspace(5) %manyusers.2, align 8 +; CHECK-NEXT: [+1]: %v1 = load i64, ptr addrspace(5) %manyusers.2, align 8 ; CHECK-NEXT: => Final Score:4 ; CHECK-NEXT: Sorted Worklist: ; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5) @@ -20,50 +22,52 @@ entry: ; should get a score of 4 %manyusers = alloca [4 x i64], align 4, addrspace(5) - store i32 42, ptr addrspace(5) %simpleuser + store i64 42, ptr addrspace(5) %simpleuser - %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2 - %v0 = load i8, ptr addrspace(5) %manyusers.1 - %v0.ext = zext i8 %v0 to i32 - store i32 %v0.ext, ptr addrspace(5) %manyusers.1 + %manyusers.1 = getelementptr i64, ptr addrspace(5) %manyusers, i64 2 + %v0 = load i64, ptr addrspace(5) %manyusers.1 + %v0.add = add i64 %v0, 1 + store i64 %v0.add, ptr addrspace(5) %manyusers.1 - %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1 - %v1 = load i8, ptr addrspace(5) %manyusers.2 - %v1.ext = zext i8 %v0 to i32 - store i32 %v1.ext, ptr addrspace(5) %manyusers.2 + %manyusers.2 = getelementptr i64, ptr addrspace(5) %manyusers, i64 1 + %v1 = load i64, ptr addrspace(5) %manyusers.2 + 
%v1.add = add i64 %v0, 1 + store i64 %v1.add, ptr addrspace(5) %manyusers.2 ret void } -; CHECK: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5) -; CHECK-NEXT: [+5]: store i32 32, ptr addrspace(5) %stack, align 4 -; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %stack, align 4 -; CHECK-NEXT: [+9]: store i32 32, ptr addrspace(5) %stack.1, align 4 -; CHECK-NEXT: [+5]: %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1 -; CHECK-NEXT: [+1]: store i32 64, ptr addrspace(5) %stack.2, align 4 -; CHECK-NEXT: [+9]: %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1 +; CHECK-LABEL: Analyzing: %stack = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: [+5]: store i64 32, ptr addrspace(5) %stack, align 8 +; CHECK-NEXT: [+1]: store i64 42, ptr addrspace(5) %stack, align 8 +; CHECK-NEXT: [+9]: store i64 32, ptr addrspace(5) %stack.1, align 8 +; CHECK-NEXT: [+5]: %outer = load i64, ptr addrspace(5) %stack.1, align 8 +; CHECK-NEXT: [+1]: store i64 64, ptr addrspace(5) %stack.2, align 8 +; CHECK-NEXT: [+9]: %inner = load i64, ptr addrspace(5) %stack.2, align 8 ; CHECK-NEXT: => Final Score:30 define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 { entry: ; should get a score of 1 %stack = alloca [4 x i64], align 4, addrspace(5) - %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4 - %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8 + %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 8 + %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 16 - store i32 42, ptr addrspace(5) %stack + store i64 42, ptr addrspace(5) %stack br label %loop.outer loop.outer: - store i32 32, ptr addrspace(5) %stack - %outer.cmp = load i1, ptr addrspace(5) %stack.1 + store i64 32, ptr addrspace(5) %stack + %outer = load i64, ptr addrspace(5) %stack.1 br label %loop.inner loop.inner: - store i32 32, ptr addrspace(5) %stack.1 - %inner.cmp = load i1, ptr addrspace(5) %stack.2 + store i64 32, ptr addrspace(5) %stack.1 + %inner = load i64, ptr addrspace(5) %stack.2 + %inner.cmp = icmp sge i64 %inner, 0 br i1 %inner.cmp, label %loop.inner, label %loop.outer exit: - store i32 64, ptr addrspace(5) %stack.2 + store i64 64, ptr addrspace(5) %stack.2 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll index 9fb73963153a2..aaec725f85890 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s ; GCN-LABEL: {{^}}float4_alloca_store4: ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4 diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll index 8e4cc2b0236c0..a7090960518af 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll @@ -1,11 +1,11 @@ -; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck 
-check-prefix=OPT %s +; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s ; Show that what the alloca promotion pass will do for non-atomic load/store. ; OPT-LABEL: @vector_alloca_not_atomic( ; -; OPT: extractelement <3 x i32> , i64 %index -define amdgpu_kernel void @vector_alloca_not_atomic(ptr addrspace(1) %out, i64 %index) { +; OPT: extractelement <3 x i32> , i32 %index +define amdgpu_kernel void @vector_alloca_not_atomic(ptr addrspace(1) %out, i32 %index) { entry: %alloca = alloca [3 x i32], addrspace(5) %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1 @@ -13,7 +13,7 @@ entry: store i32 0, ptr addrspace(5) %alloca store i32 1, ptr addrspace(5) %a1 store i32 2, ptr addrspace(5) %a2 - %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index + %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index %data = load i32, ptr addrspace(5) %tmp store i32 %data, ptr addrspace(1) %out ret void @@ -26,7 +26,7 @@ entry: ; OPT: store i32 1, ptr addrspace(5) ; OPT: store i32 2, ptr addrspace(5) ; OPT: load atomic i32, ptr addrspace(5) -define amdgpu_kernel void @vector_alloca_atomic_read(ptr addrspace(1) %out, i64 %index) { +define amdgpu_kernel void @vector_alloca_atomic_read(ptr addrspace(1) %out, i32 %index) { entry: %alloca = alloca [3 x i32], addrspace(5) %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1 @@ -34,7 +34,7 @@ entry: store i32 0, ptr addrspace(5) %alloca store i32 1, ptr addrspace(5) %a1 store i32 2, ptr addrspace(5) %a2 - %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index + %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index %data = load atomic i32, ptr addrspace(5) %tmp acquire, align 4 store i32 %data, ptr addrspace(1) %out ret void @@ -47,7 +47,7 @@ entry: ; OPT: store atomic i32 1, ptr addrspace(5) ; OPT: store atomic i32 2, ptr addrspace(5) ; OPT: load i32, ptr addrspace(5) -define amdgpu_kernel void @vector_alloca_atomic_write(ptr addrspace(1) %out, i64 %index) { +define amdgpu_kernel void @vector_alloca_atomic_write(ptr addrspace(1) %out, i32 %index) { entry: %alloca = alloca [3 x i32], addrspace(5) %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1 @@ -55,7 +55,7 @@ entry: store atomic i32 0, ptr addrspace(5) %alloca release, align 4 store atomic i32 1, ptr addrspace(5) %a1 release, align 4 store atomic i32 2, ptr addrspace(5) %a2 release, align 4 - %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index + %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index %data = load i32, ptr addrspace(5) %tmp store i32 %data, ptr addrspace(1) %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll index 9c05f4d16cb4e..4a29f7e53e93a 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll @@ -72,7 +72,8 @@ entry: ; OPT-NOT: alloca ; OPT: bb2: ; OPT: %promotealloca = phi <6 x float> [ zeroinitializer, %bb ], [ %0, %bb2 ] -; OPT: %0 = insertelement <6 x float> %promotealloca, float %tmp71, i32 %tmp10 +; OPT: [[TMP:%tmp7.*]] = load float, ptr addrspace(1) %tmp5, align 4 +; OPT: %0 = insertelement <6 x float> %promotealloca, float [[TMP]], i32 %tmp10 ; OPT: .preheader: ; OPT: %bc = bitcast <6 x float> %0 to <6 x i32> ; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20 @@ -132,7 +133,8 @@ bb15: ; preds = 
%.preheader ; OPT-NOT: alloca ; OPT: bb2: ; OPT: %promotealloca = phi <6 x double> [ zeroinitializer, %bb ], [ %0, %bb2 ] -; OPT: %0 = insertelement <6 x double> %promotealloca, double %tmp71, i32 %tmp10 +; OPT: [[TMP:%tmp7.*]] = load double, ptr addrspace(1) %tmp5, align 8 +; OPT: %0 = insertelement <6 x double> %promotealloca, double [[TMP]], i32 %tmp10 ; OPT: .preheader: ; OPT: %bc = bitcast <6 x double> %0 to <6 x i64> ; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20
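
For readability, a minimal standalone sketch of how the decomposed index carried by GEPToVectorIndex (VarIndex, VarMul, ConstIndex) is recombined into a single i32 vector index, mirroring what calculateVectorIndex() in this patch does lazily via the Full field. The free-standing helper and the name buildVectorIndex are assumptions for illustration only, not part of the change:

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Illustrative helper (not in the patch): rebuild the full vector index from
// the optional parts of a GEPToVectorIndex-style decomposition.
static Value *buildVectorIndex(IRBuilder<> &B, Value *VarIndex,
                               ConstantInt *VarMul, ConstantInt *ConstIndex) {
  Value *Result = nullptr;
  if (VarIndex) {
    // Normalize the variable part to i32 before doing any index arithmetic.
    Result = B.CreateSExtOrTrunc(VarIndex, B.getInt32Ty());
    if (VarMul) // stride in vector elements; absent means a stride of 1
      Result = B.CreateMul(Result, VarMul);
  }
  if (ConstIndex) // constant element offset; absent means 0
    Result = Result ? B.CreateAdd(Result, ConstIndex) : ConstIndex;
  // With neither part present, the pointer addresses element 0.
  return Result ? Result : B.getInt32(0);
}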