diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 29591ddd669c9..9293f8954cfe2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -469,6 +469,7 @@ static Value *promoteAllocaUserToVector(
       assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
       const unsigned NumWrittenElts =
           AccessSize / DL.getTypeStoreSize(VecEltTy);
+      const unsigned NumVecElts = VectorTy->getNumElements();
       auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
       assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
 
@@ -480,7 +481,8 @@ static Value *promoteAllocaUserToVector(
       Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
 
       Value *CurVec = GetOrLoadCurrentVectorValue();
-      for (unsigned K = 0; K < NumWrittenElts; ++K) {
+      for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
+           K < NumElts; ++K) {
         Value *CurIdx =
             Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
         CurVec = Builder.CreateInsertElement(
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
index 765aa250a48f4..7c5410004ed5b 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
@@ -458,3 +458,30 @@ finally:
   %load = load <4 x i16>, ptr addrspace(5) %ptr.2, align 2
   ret <4 x i16> %load
 }
+
+
+; Check the case when the alloca is smaller than the vector size.
+define void @test_smaller_alloca_store(<4 x i32> %store1, <4 x i32> %store2) {
+; CHECK-LABEL: define void @test_smaller_alloca_store
+; CHECK-SAME: (<4 x i32> [[STORE1:%.*]], <4 x i32> [[STORE2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x i32> [[STORE1]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[STORE1]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i32> [[TMP1]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[STORE1]], i64 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[STORE2]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x i32> [[TMP5]], i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[STORE2]], i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x i32> [[TMP7]], i32 [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[STORE2]], i64 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <3 x i32> [[TMP9]], i32 [[TMP10]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %res = alloca <3 x i32>, align 16, addrspace(5)
+  store <4 x i32> %store1, ptr addrspace(5) %res, align 16
+  store <4 x i32> %store2, ptr addrspace(5) %res, align 16
+  ret void
+}