diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 85bcb3c7d0982..4246ec671a0f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -334,86 +334,49 @@ static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
                               ArrayTy->getNumElements());
 }
 
-static Value *stripBitcasts(Value *V) {
-  while (Instruction *I = dyn_cast<Instruction>(V)) {
-    if (I->getOpcode() != Instruction::BitCast)
-      break;
-    V = I->getOperand(0);
-  }
-  return V;
-}
-
 static Value *
 calculateVectorIndex(Value *Ptr,
                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
-  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
+  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
   if (!GEP)
-    return nullptr;
+    return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
 
   auto I = GEPIdx.find(GEP);
-  return I == GEPIdx.end() ? nullptr : I->second;
+  assert(I != GEPIdx.end() && "Must have entry for GEP!");
+  return I->second;
 }
 
-static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
-  // FIXME we only support simple cases
-  if (GEP->getNumOperands() != 3)
+static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
+                               Type *VecElemTy, const DataLayout &DL) {
+  // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
+  // helper.
+  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
+  MapVector<Value *, APInt> VarOffsets;
+  APInt ConstOffset(BW, 0);
+  if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
+      !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
     return nullptr;
 
-  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
-  if (!I0 || !I0->isZero())
+  unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
+  if (VarOffsets.size() > 1)
    return nullptr;
 
-  return GEP->getOperand(2);
-}
-
-// Not an instruction handled below to turn into a vector.
-//
-// TODO: Check isTriviallyVectorizable for calls and handle other
-// instructions.
-static bool canVectorizeInst(Instruction *Inst, User *User,
-                             const DataLayout &DL) {
-  switch (Inst->getOpcode()) {
-  case Instruction::Load: {
-    // Currently only handle the case where the Pointer Operand is a GEP.
-    // Also we could not vectorize volatile or atomic loads.
-    LoadInst *LI = cast<LoadInst>(Inst);
-    if (isa<AllocaInst>(User) &&
-        LI->getPointerOperandType() == User->getType() &&
-        isa<VectorType>(LI->getType()))
-      return true;
-
-    Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
-    if (!PtrInst)
-      return false;
-
-    return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
-            PtrInst->getOpcode() == Instruction::BitCast) &&
-           LI->isSimple();
+  if (VarOffsets.size() == 1) {
+    // Only handle cases where we don't need to insert extra arithmetic
+    // instructions.
+    const auto &VarOffset = VarOffsets.front();
+    if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
+      return nullptr;
+    return VarOffset.first;
   }
-  case Instruction::BitCast:
-    return true;
-  case Instruction::Store: {
-    // Must be the stored pointer operand, not a stored value, plus
-    // since it should be canonical form, the User should be a GEP.
-    // Also we could not vectorize volatile or atomic stores.
-    StoreInst *SI = cast<StoreInst>(Inst);
-    if (isa<AllocaInst>(User) &&
-        SI->getPointerOperandType() == User->getType() &&
-        isa<VectorType>(SI->getValueOperand()->getType()))
-      return true;
-
-    Instruction *UserInst = dyn_cast<Instruction>(User);
-    if (!UserInst)
-      return false;
-    return (SI->getPointerOperand() == User) &&
-           (UserInst->getOpcode() == Instruction::GetElementPtr ||
-            UserInst->getOpcode() == Instruction::BitCast) &&
-           SI->isSimple();
-  }
-  default:
-    return false;
-  }
+  APInt Quot;
+  uint64_t Rem;
+  APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
+  if (Rem != 0)
+    return nullptr;
+
+  return ConstantInt::get(GEP->getContext(), Quot);
 }
 
 static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
@@ -455,73 +418,87 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
   }
 
   std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
-  std::vector<Value *> WorkList;
-  SmallVector<User *, 8> Users(Alloca->users());
-  SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
+  SmallVector<Instruction *> WorkList;
+  SmallVector<Use *, 8> Uses;
+  for (Use &U : Alloca->uses())
+    Uses.push_back(&U);
+
   Type *VecEltTy = VectorTy->getElementType();
-  while (!Users.empty()) {
-    User *AllocaUser = Users.pop_back_val();
-    User *UseUser = UseUsers.pop_back_val();
-    Instruction *Inst = dyn_cast<Instruction>(AllocaUser);
-
-    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
-    if (!GEP) {
-      if (!canVectorizeInst(Inst, UseUser, DL))
+  while (!Uses.empty()) {
+    Use *U = Uses.pop_back_val();
+    Instruction *Inst = dyn_cast<Instruction>(U->getUser());
+
+    if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
+      // This is a store of the pointer, not to the pointer.
+      if (isa<StoreInst>(Inst) &&
+          U->getOperandNo() != StoreInst::getPointerOperandIndex())
        return false;
 
-      if (Inst->getOpcode() == Instruction::BitCast) {
-        Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
-        Type *ToTy = Inst->getType()->getPointerElementType();
-        if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
-            DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
-          continue;
-
-        for (User *CastUser : Inst->users()) {
-          if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
-            continue;
-          Users.push_back(CastUser);
-          UseUsers.push_back(Inst);
-        }
+      Type *AccessTy = getLoadStoreType(Inst);
+      Ptr = Ptr->stripPointerCasts();
+      // Alloca already accessed as vector, leave alone.
+      if (Ptr == Alloca && DL.getTypeStoreSize(Alloca->getAllocatedType()) ==
+                               DL.getTypeStoreSize(AccessTy))
        continue;
-      }
 
-      WorkList.push_back(AllocaUser);
+      // Check that this is a simple access of a vector element.
+      bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
+                                          : cast<StoreInst>(Inst)->isSimple();
+      if (!IsSimple ||
+          !CastInst::isBitOrNoopPointerCastable(VecEltTy, AccessTy, DL))
+        return false;
+
+      WorkList.push_back(Inst);
      continue;
    }
 
-    Value *Index = GEPToVectorIndex(GEP);
+    if (isa<BitCastInst>(Inst)) {
+      // Look through bitcasts.
+      for (Use &U : Inst->uses())
+        Uses.push_back(&U);
+      continue;
+    }
 
-    // If we can't compute a vector index from this GEP, then we can't
-    // promote this alloca to vector.
-    if (!Index) {
-      LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
-                        << '\n');
-      return false;
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+      // If we can't compute a vector index from this GEP, then we can't
+      // promote this alloca to vector.
+      Value *Index = GEPToVectorIndex(GEP, Alloca, VecEltTy, DL);
+      if (!Index) {
+        LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
+                          << '\n');
+        return false;
+      }
+
+      GEPVectorIdx[GEP] = Index;
+      for (Use &U : Inst->uses())
+        Uses.push_back(&U);
+      continue;
    }
 
-    GEPVectorIdx[GEP] = Index;
-    Users.append(GEP->user_begin(), GEP->user_end());
-    UseUsers.append(GEP->getNumUses(), GEP);
+    // Ignore assume-like intrinsics and comparisons used in assumes.
+    if (isAssumeLikeIntrinsic(Inst))
+      continue;
+
+    if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
+          return isAssumeLikeIntrinsic(cast<Instruction>(U));
+        }))
+      continue;
+
+    // Unknown user.
+    return false;
  }
 
   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                     << *VectorTy << '\n');
 
-  for (Value *V : WorkList) {
-    Instruction *Inst = cast<Instruction>(V);
+  for (Instruction *Inst : WorkList) {
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
-        break;
-
       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      if (!Index)
-        break;
-
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+      Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
@@ -533,16 +510,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(Inst);
-      if (SI->getValueOperand()->getType() == AllocaTy ||
-          SI->getValueOperand()->getType()->isVectorTy())
-        break;
-
       Value *Ptr = SI->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      if (!Index)
-        break;
-
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+      Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
       Value *Elt = SI->getValueOperand();
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
index d9db49e0767d4..3e7b4c99f74b6 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -72,9 +72,15 @@ define amdgpu_vs void @promote_store_aggr() #0 {
 ; CHECK-NEXT: [[FOO2:%.*]] = load i32, i32* [[I]], align 4
 ; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float
 ; CHECK-NEXT: [[FOO4:%.*]] = getelementptr [2 x float], [2 x float]* [[F1]], i32 0, i32 0
-; CHECK-NEXT: store float [[FOO3]], float* [[FOO4]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x float]* [[F1]] to <2 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[FOO3]], i32 0
+; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP1]], align 8
 ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], [2 x float]* [[F1]], i32 0, i32 1
-; CHECK-NEXT: store float 2.000000e+00, float* [[FOO5]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x float]* [[F1]] to <2 x float>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float 2.000000e+00, i64 1
+; CHECK-NEXT: store <2 x float> [[TMP6]], <2 x float>* [[TMP4]], align 8
 ; CHECK-NEXT: [[FOO6:%.*]] = load [2 x float], [2 x float]* [[F1]], align 4
 ; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2]], [[BLOCK2]] addrspace(1)* @block2, i32 0, i32 1
 ; CHECK-NEXT: store [2 x float] [[FOO6]], [2 x float] addrspace(1)* [[FOO7]], align 4
@@ -116,13 +122,15 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 ; CHECK-NEXT: store [2 x float] [[FOO3]], [2 x float]* [[F1]], align 4
 ; CHECK-NEXT: [[FOO4:%.*]] = load i32, i32* [[I]], align 4
 ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], [2 x float]* [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT: [[FOO6:%.*]] = load float, float* [[FOO5]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x float]* [[F1]] to <2 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO4]]
 ; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16
 ; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, <4 x float>* [[FOO7]], align 16
-; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0
-; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1
-; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2
-; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3
+; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP3]], i32 0
+; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2
+; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3
 ; CHECK-NEXT: [[FOO13:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0
 ; CHECK-NEXT: store <4 x float> [[FOO12]], <4 x float> addrspace(1)* [[FOO13]], align 16
 ; CHECK-NEXT: ret void
@@ -163,17 +171,28 @@ define amdgpu_ps void @promote_double_aggr() #0 {
 ; CHECK-NEXT: [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1
 ; CHECK-NEXT: store [2 x double] [[FOO5]], [2 x double]* [[S]], align 8
 ; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 1
-; CHECK-NEXT: [[FOO7:%.*]] = load double, double* [[FOO6]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i64 1
 ; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 1
-; CHECK-NEXT: [[FOO9:%.*]] = load double, double* [[FOO8]], align 8
-; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[FOO7]], [[FOO9]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 16
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[TMP3]], [[TMP6]]
 ; CHECK-NEXT: [[FOO11:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 0
-; CHECK-NEXT: store double [[FOO10]], double* [[FOO11]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 16
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[FOO10]], i32 0
+; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP7]], align 16
 ; CHECK-NEXT: [[FOO12:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 0
-; CHECK-NEXT: [[FOO13:%.*]] = load double, double* [[FOO12]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* [[TMP10]], align 16
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
 ; CHECK-NEXT: [[FOO14:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 1
-; CHECK-NEXT: [[FOO15:%.*]] = load double, double* [[FOO14]], align 8
-; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[FOO13]], [[FOO15]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, <2 x double>* [[TMP13]], align 16
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[TMP14]], i64 1
+; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[TMP12]], [[TMP15]]
 ; CHECK-NEXT: [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
 ; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0
 ; CHECK-NEXT: [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index 393e80f6be132..fe4355f691967 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -281,14 +281,8 @@ bb13:                                             ; preds = %.preheader
 
 ; TODO: llvm.assume can be ingored
 ; OPT-LABEL: @vector_read_alloca_bitcast_assume(
-; OPT: %tmp = alloca <4 x i32>, align 16, addrspace(5)
-; OPT-NEXT: %x = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %tmp, i64 0, i64 0
-; OPT-NEXT: store i32 0, i32 addrspace(5)* %x, align 16
-; OPT-NEXT: %0 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp, align 16
-; OPT-NEXT: %1 = shufflevector <4 x i32> %0, <4 x i32> , <4 x i32>
-; OPT-NEXT: store <4 x i32> %1, <4 x i32> addrspace(5)* %tmp, align 16
-; OPT-NEXT: %2 = extractelement <4 x i32> %1, i32 %index
-; OPT-NEXT: store i32 %2, i32 addrspace(1)* %out, align 4
+; OPT: %0 = extractelement <4 x i32> , i32 %index
+; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
 
 ; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
 ; GCN-COUNT-4: buffer_store_dword
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
index ad893f12f226f..ddcc53ce85569 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
@@ -113,16 +113,9 @@ entry:
   ret void
 }
 
-; FIXME: Should be able to promote this. Instcombine should fold the
-; cast in the hasOneUse case so it might not matter in practice
-
 ; OPT-LABEL: @vector_read_bitcast_alloca(
-; OPT: alloca [4 x float]
-; OPT: store float
-; OPT: store float
-; OPT: store float
-; OPT: store float
-; OPT: load float
+; OPT: %0 = extractelement <4 x float> , i32 %index
+; OPT: store float %0, float addrspace(1)* %out, align 4
 define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [4 x i32], addrspace(5)