diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 52b416fdd755b..27837e260ebc1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -33214,6 +33214,37 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
 }
 
+bool AArch64TargetLowering::canCombineStoreAndExtract(Type *VectorTy,
+                                                      Value *Idx,
+                                                      unsigned &Cost) const {
+  // If we do not have NEON, fixed-width vector types are not natively
+  // supported.
+  if (!Subtarget->hasNEON())
+    return false;
+
+  // Floating point values and vector values map to the same register file.
+  // Therefore, although we could do a store extract of a vector type, this is
+  // better to leave at float as we have more freedom in the addressing mode for
+  // those.
+  if (VectorTy->isFPOrFPVectorTy() || VectorTy->isScalableTy())
+    return false;
+
+  // If the index is unknown at compile time, this is very expensive to lower
+  // and it is not possible to combine the store with the extract.
+  if (!isa<ConstantInt>(Idx))
+    return false;
+
+  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
+  unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
+  // We can do a store + vector extract on any vector that fits perfectly in a D
+  // or Q register.
+  if (BitWidth == 64 || BitWidth == 128) {
+    Cost = 0;
+    return true;
+  }
+  return false;
+}
+
 bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode(
     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
     bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 58efdd3e18fc0..50d4541405d36 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -894,6 +894,9 @@ class AArch64TargetLowering : public TargetLowering {
   bool shouldLocalize(const MachineInstr &MI,
                       const TargetTransformInfo *TTI) const override;
 
+  bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                 unsigned &Cost) const override;
+
   bool SimplifyDemandedBitsForTargetNode(SDValue Op,
                                          const APInt &OriginalDemandedBits,
                                          const APInt &OriginalDemandedElts,
diff --git a/llvm/test/CodeGen/AArch64/vector-promotion.ll b/llvm/test/CodeGen/AArch64/vector-promotion.ll
new file mode 100644
index 0000000000000..610f8ba0224e2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-promotion.ll
@@ -0,0 +1,69 @@
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=aarch64 %s -o - -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=aarch64 %s -o - -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck --check-prefix=ASM %s
+
+; IR-BOTH-LABEL: @simpleOneInstructionPromotion
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1
+; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]],
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR]], i32 1
+; IR-BOTH-NEXT: store i32 [[EXTRACT]], ptr %dest
+; IR-BOTH-NEXT: ret
+; ASM-LABEL: simpleOneInstructionPromotion:
+; ASM-NOT: umov
+define void @simpleOneInstructionPromotion(ptr %addr1, ptr %dest) {
+  %in1 = load <2 x i32>, ptr %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = or i32 %extract, 1
+  store i32 %out, ptr %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @unsupportedInstructionForPromotion
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
+; IR-BOTH-NEXT: [[CMP:%[a-zA-Z_0-9-]+]] = icmp eq i32 [[EXTRACT]], %in2
+; IR-BOTH-NEXT: store i1 [[CMP]], ptr %dest
+; IR-BOTH-NEXT: ret
+define void @unsupportedInstructionForPromotion(ptr %addr1, i32 %in2, ptr %dest) {
+  %in1 = load <2 x i32>, ptr %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  %out = icmp eq i32 %extract, %in2
+  store i1 %out, ptr %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @chainOfInstructionsToPromote
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1
+; IR-BOTH-NEXT: [[VECTOR_OR1:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]],
+; IR-BOTH-NEXT: [[VECTOR_OR2:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR1]],
+; IR-BOTH-NEXT: [[VECTOR_OR3:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR2]],
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR3]], i32 0
+; IR-BOTH-NEXT: store i32 [[EXTRACT]], ptr %dest
+; IR-BOTH-NEXT: ret
+define void @chainOfInstructionsToPromote(ptr %addr1, ptr %dest) {
+  %in1 = load <2 x i32>, ptr %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  %out1 = or i32 %extract, 1
+  %out2 = or i32 %out1, 1
+  %out3 = or i32 %out2, 1
+  store i32 %out3, ptr %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @simpleOneInstructionPromotionVariableIdx
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1
+; Scalar version:
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 %idx
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
+; Vector version:
+; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], splat (i32 1)
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[OR]], i32 %idx
+; IR-BOTH-NEXT: store i32 [[RES]], ptr %dest
+; IR-BOTH-NEXT: ret
+define void @simpleOneInstructionPromotionVariableIdx(ptr %addr1, ptr %dest, i32 %idx) {
+  %in1 = load <2 x i32>, ptr %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 %idx
+  %out = or i32 %extract, 1
+  store i32 %out, ptr %dest, align 4
+  ret void
+}