diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8410f7201c486b..3502ae90d69595 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3768,11 +3768,24 @@ int BoUpSLP::getSpillCost() const {
   SmallPtrSet<Instruction *, 4> LiveValues;
   Instruction *PrevInst = nullptr;
 
+  // The entries in VectorizableTree are not necessarily ordered by their
+  // position in basic blocks. Collect them and order them by dominance so later
+  // instructions are guaranteed to be visited first. For instructions in
+  // different basic blocks, we only scan to the beginning of the block, so
+  // their order does not matter, as long as all instructions in a basic block
+  // are grouped together. Using dominance ensures a deterministic order.
+  SmallVector<Instruction *, 16> OrderedScalars;
   for (const auto &TEPtr : VectorizableTree) {
     Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
     if (!Inst)
       continue;
+    OrderedScalars.push_back(Inst);
+  }
+  llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) {
+    return DT->dominates(B, A);
+  });
 
+  for (Instruction *Inst : OrderedScalars) {
     if (!PrevInst) {
       PrevInst = Inst;
       continue;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll
index 9286b7ce9a6902..8e0ca4b2938416 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll
@@ -13,22 +13,19 @@ define void @test(i64* %ptr, i64* noalias %res) {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[CALL_I_I:%.*]] = call i32* @get_ptr()
-; CHECK-NEXT:    [[L_0_0:%.*]] = load i32, i32* [[CALL_I_I]], align 2
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 2
-; CHECK-NEXT:    [[L_1_0:%.*]] = load i32, i32* [[GEP_1]], align 2
-; CHECK-NEXT:    [[EXT_0_0:%.*]] = zext i32 [[L_0_0]] to i64
-; CHECK-NEXT:    [[EXT_1_0:%.*]] = zext i32 [[L_1_0]] to i64
-; CHECK-NEXT:    [[SUB_1:%.*]] = sub nsw i64 [[EXT_0_0]], [[EXT_1_0]]
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 1
-; CHECK-NEXT:    [[L_0_1:%.*]] = load i32, i32* [[GEP_2]], align 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[CALL_I_I]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 2
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 3
-; CHECK-NEXT:    [[L_1_1:%.*]] = load i32, i32* [[GEP_3]], align 2
-; CHECK-NEXT:    [[EXT_0_1:%.*]] = zext i32 [[L_0_1]] to i64
-; CHECK-NEXT:    [[EXT_1_1:%.*]] = zext i32 [[L_1_1]] to i64
-; CHECK-NEXT:    [[SUB_2:%.*]] = sub nsw i64 [[EXT_0_1]], [[EXT_1_1]]
-; CHECK-NEXT:    store i64 [[SUB_1]], i64* [[RES:%.*]], align 8
-; CHECK-NEXT:    [[RES_1:%.*]] = getelementptr i64, i64* [[RES]], i64 1
-; CHECK-NEXT:    store i64 [[SUB_2]], i64* [[RES_1]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[GEP_1]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[RES_1:%.*]] = getelementptr i64, i64* [[RES:%.*]], i64 1
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[RES]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 8
 ; CHECK-NEXT:    [[C:%.*]] = call i1 @cond()
 ; CHECK-NEXT:    br i1 [[C]], label [[FOR_BODY]], label [[EXIT:%.*]]
 ; CHECK:       exit:
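For illustration only (not part of the patch above): a minimal sketch of the dominance-based ordering that getSpillCost() now uses, pulled out into a hypothetical standalone helper named orderByDominance. The comparator DT.dominates(B, A) sorts an instruction A before B whenever B dominates A, so later (dominated) instructions are visited first, and llvm::stable_sort keeps the incoming relative order for instructions in unrelated basic blocks, matching the intent described in the new comment.

// Hypothetical helper, sketched for illustration; the patch inlines the same
// logic directly in BoUpSLP::getSpillCost().
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"

static llvm::SmallVector<llvm::Instruction *, 16>
orderByDominance(llvm::ArrayRef<llvm::Instruction *> Insts,
                 const llvm::DominatorTree &DT) {
  llvm::SmallVector<llvm::Instruction *, 16> Ordered(Insts.begin(),
                                                     Insts.end());
  // A sorts before B when B dominates A, so dominated (later) instructions
  // come first; stable_sort keeps the original order for unrelated blocks.
  llvm::stable_sort(Ordered,
                    [&DT](llvm::Instruction *A, llvm::Instruction *B) {
                      return DT.dominates(B, A);
                    });
  return Ordered;
}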