diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index d07fb1ed3f344..68d77f76666f6 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5279,30 +5279,50 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
           dyn_cast<llvm::StructType>(ArgInfo.getCoerceToType());
       if (STy && ArgInfo.isDirect() && ArgInfo.getCanBeFlattened()) {
         llvm::Type *SrcTy = Src.getElementType();
-        uint64_t SrcSize = CGM.getDataLayout().getTypeAllocSize(SrcTy);
-        uint64_t DstSize = CGM.getDataLayout().getTypeAllocSize(STy);
-
-        // If the source type is smaller than the destination type of the
-        // coerce-to logic, copy the source value into a temp alloca the size
-        // of the destination type to allow loading all of it. The bits past
-        // the source value are left undef.
-        if (SrcSize < DstSize) {
-          Address TempAlloca
-            = CreateTempAlloca(STy, Src.getAlignment(),
-                               Src.getName() + ".coerce");
-          Builder.CreateMemCpy(TempAlloca, Src, SrcSize);
-          Src = TempAlloca;
+        llvm::TypeSize SrcTypeSize =
+            CGM.getDataLayout().getTypeAllocSize(SrcTy);
+        llvm::TypeSize DstTypeSize = CGM.getDataLayout().getTypeAllocSize(STy);
+        if (SrcTypeSize.isScalable()) {
+          assert(STy->containsHomogeneousScalableVectorTypes() &&
+                 "ABI only supports structure with homogeneous scalable vector "
+                 "type");
+          assert(SrcTypeSize == DstTypeSize &&
+                 "Only allow non-fractional movement of structure with "
+                 "homogeneous scalable vector type");
+          assert(NumIRArgs == STy->getNumElements());
+
+          llvm::Value *StoredStructValue =
+              Builder.CreateLoad(Src, Src.getName() + ".tuple");
+          for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+            llvm::Value *Extract = Builder.CreateExtractValue(
+                StoredStructValue, i, Src.getName() + ".extract" + Twine(i));
+            IRCallArgs[FirstIRArg + i] = Extract;
+          }
         } else {
-          Src = Src.withElementType(STy);
-        }
+          uint64_t SrcSize = SrcTypeSize.getFixedValue();
+          uint64_t DstSize = DstTypeSize.getFixedValue();
+
+          // If the source type is smaller than the destination type of the
+          // coerce-to logic, copy the source value into a temp alloca the size
+          // of the destination type to allow loading all of it. The bits past
+          // the source value are left undef.
+          if (SrcSize < DstSize) {
+            Address TempAlloca = CreateTempAlloca(STy, Src.getAlignment(),
+                                                  Src.getName() + ".coerce");
+            Builder.CreateMemCpy(TempAlloca, Src, SrcSize);
+            Src = TempAlloca;
+          } else {
+            Src = Src.withElementType(STy);
+          }
 
-        assert(NumIRArgs == STy->getNumElements());
-        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
-          Address EltPtr = Builder.CreateStructGEP(Src, i);
-          llvm::Value *LI = Builder.CreateLoad(EltPtr);
-          if (ArgHasMaybeUndefAttr)
-            LI = Builder.CreateFreeze(LI);
-          IRCallArgs[FirstIRArg + i] = LI;
+          assert(NumIRArgs == STy->getNumElements());
+          for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+            Address EltPtr = Builder.CreateStructGEP(Src, i);
+            llvm::Value *LI = Builder.CreateLoad(EltPtr);
+            if (ArgHasMaybeUndefAttr)
+              LI = Builder.CreateFreeze(LI);
+            IRCallArgs[FirstIRArg + i] = LI;
+          }
         }
       } else {
         // In the simple case, just pass the coerced loaded value.
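
The functional change in the hunk above: getTypeAllocSize() returns an llvm::TypeSize, and the old code forced it straight into uint64_t, which is only meaningful for fixed-size types. For a struct of homogeneous scalable vectors (an RVV tuple), the coercion now loads the aggregate once and splits it with extractvalue instead of GEPing to each field. A minimal C reproducer would look like the sketch below; the function names and driver flags are illustrative, not taken from the patch:

    // repro.c -- sketch; assumes a RISC-V target with the V extension,
    // e.g.: clang -target riscv64 -march=rv64gcv -O0 -emit-llvm -S repro.c
    __rvv_int32m1x2_t take_tuple(__rvv_int32m1x2_t v_tuple);

    __rvv_int32m1x2_t pass_tuple(__rvv_int32m1x2_t v_tuple) {
      // Passing the tuple by value hits the isDirect() + getCanBeFlattened()
      // path patched above: the { <vscale x 2 x i32>, <vscale x 2 x i32> }
      // aggregate is loaded whole and split into two IR arguments.
      return take_tuple(v_tuple);
    }
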
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c
index f4235795a8622..f8d755992eeac 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c
@@ -90,3 +90,36 @@ void baz(__rvv_int32m1x2_t v_tuple) {
 __rvv_int32m1x2_t qux(__rvv_int32m1x2_t v_tuple) {
   return v_tuple;
 }
+
+// O0-LABEL: define dso_local { <vscale x 2 x i32>, <vscale x 2 x i32> } @quux
+// O0-SAME: (<vscale x 2 x i32> [[V_TUPLE_COERCE0:%.*]], <vscale x 2 x i32> [[V_TUPLE_COERCE1:%.*]]) #[[ATTR0]] {
+// O0-NEXT:  entry:
+// O0-NEXT:    [[V_TUPLE:%.*]] = alloca { <vscale x 2 x i32>, <vscale x 2 x i32> }, align 4
+// O0-NEXT:    [[V_TUPLE_ADDR:%.*]] = alloca { <vscale x 2 x i32>, <vscale x 2 x i32> }, align 4
+// O0-NEXT:    [[COERCE:%.*]] = alloca { <vscale x 2 x i32>, <vscale x 2 x i32> }, align 4
+// O0-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[V_TUPLE_COERCE0]], 0
+// O0-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP0]], <vscale x 2 x i32> [[V_TUPLE_COERCE1]], 1
+// O0-NEXT:    store { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP1]], ptr [[V_TUPLE]], align 4
+// O0-NEXT:    [[V_TUPLE1:%.*]] = load { <vscale x 2 x i32>, <vscale x 2 x i32> }, ptr [[V_TUPLE]], align 4
+// O0-NEXT:    store { <vscale x 2 x i32>, <vscale x 2 x i32> } [[V_TUPLE1]], ptr [[V_TUPLE_ADDR]], align 4
+// O0-NEXT:    [[TMP2:%.*]] = load { <vscale x 2 x i32>, <vscale x 2 x i32> }, ptr [[V_TUPLE_ADDR]], align 4
+// O0-NEXT:    store { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP2]], ptr [[COERCE]], align 4
+// O0-NEXT:    [[COERCE_TUPLE:%.*]] = load { <vscale x 2 x i32>, <vscale x 2 x i32> }, ptr [[COERCE]], align 4
+// O0-NEXT:    [[COERCE_EXTRACT0:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[COERCE_TUPLE]], 0
+// O0-NEXT:    [[COERCE_EXTRACT1:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[COERCE_TUPLE]], 1
+// O0-NEXT:    [[CALL:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @qux(<vscale x 2 x i32> [[COERCE_EXTRACT0]], <vscale x 2 x i32> [[COERCE_EXTRACT1]])
+// O0-NEXT:    ret { <vscale x 2 x i32>, <vscale x 2 x i32> } [[CALL]]
+//
+// AFTER_MEM2REG-LABEL: define dso_local { <vscale x 2 x i32>, <vscale x 2 x i32> } @quux
+// AFTER_MEM2REG-SAME: (<vscale x 2 x i32> [[V_TUPLE_COERCE0:%.*]], <vscale x 2 x i32> [[V_TUPLE_COERCE1:%.*]]) #[[ATTR0]] {
+// AFTER_MEM2REG-NEXT:  entry:
+// AFTER_MEM2REG-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[V_TUPLE_COERCE0]], 0
+// AFTER_MEM2REG-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP0]], <vscale x 2 x i32> [[V_TUPLE_COERCE1]], 1
+// AFTER_MEM2REG-NEXT:    [[COERCE_EXTRACT0:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP1]], 0
+// AFTER_MEM2REG-NEXT:    [[COERCE_EXTRACT1:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP1]], 1
+// AFTER_MEM2REG-NEXT:    [[CALL:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @qux(<vscale x 2 x i32> [[COERCE_EXTRACT0]], <vscale x 2 x i32> [[COERCE_EXTRACT1]])
+// AFTER_MEM2REG-NEXT:    ret { <vscale x 2 x i32>, <vscale x 2 x i32> } [[CALL]]
+//
+__rvv_int32m1x2_t quux(__rvv_int32m1x2_t v_tuple) {
+  return qux(v_tuple);
+}
diff --git a/llvm/test/Transforms/SROA/scalable-vector-struct.ll b/llvm/test/Transforms/SROA/scalable-vector-struct.ll
index 92cd44d2b5ac3..1af4fbbd9254b 100644
--- a/llvm/test/Transforms/SROA/scalable-vector-struct.ll
+++ b/llvm/test/Transforms/SROA/scalable-vector-struct.ll
@@ -20,3 +20,34 @@ define %struct.test @alloca(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y) {
   %val = load %struct.test, %struct.test* %addr, align 4
   ret %struct.test %val
 }
+
+
+define { <vscale x 2 x i32>, <vscale x 2 x i32> } @return_tuple(<vscale x 2 x i32> %v_tuple.coerce0, <vscale x 2 x i32> %v_tuple.coerce1) {
+; CHECK-LABEL: @return_tuple(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[V_TUPLE_COERCE0:%.*]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP0]], <vscale x 2 x i32> [[V_TUPLE_COERCE1:%.*]], 1
+; CHECK-NEXT:    [[COERCE_EXTRACT0:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP1]], 0
+; CHECK-NEXT:    [[COERCE_EXTRACT1:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP1]], 1
+; CHECK-NEXT:    [[CALL:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @foo(<vscale x 2 x i32> [[COERCE_EXTRACT0]], <vscale x 2 x i32> [[COERCE_EXTRACT1]])
+; CHECK-NEXT:    ret { <vscale x 2 x i32>, <vscale x 2 x i32> } [[CALL]]
+;
+entry:
+  %v_tuple = alloca { <vscale x 2 x i32>, <vscale x 2 x i32> }, align 4
+  %v_tuple.addr = alloca { <vscale x 2 x i32>, <vscale x 2 x i32> }, align 4
+  %coerce = alloca { <vscale x 2 x i32>, <vscale x 2 x i32> }, align 4
+  %0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %v_tuple.coerce0, 0
+  %1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %0, <vscale x 2 x i32> %v_tuple.coerce1, 1
+  store { <vscale x 2 x i32>, <vscale x 2 x i32> } %1, ptr %v_tuple, align 4
+  %v_tuple1 = load { <vscale x 2 x i32>, <vscale x 2 x i32> }, ptr %v_tuple, align 4
+  store { <vscale x 2 x i32>, <vscale x 2 x i32> } %v_tuple1, ptr %v_tuple.addr, align 4
+  %2 = load { <vscale x 2 x i32>, <vscale x 2 x i32> }, ptr %v_tuple.addr, align 4
+  store { <vscale x 2 x i32>, <vscale x 2 x i32> } %2, ptr %coerce, align 4
+  %coerce.tuple = load { <vscale x 2 x i32>, <vscale x 2 x i32> }, ptr %coerce, align 4
+  %coerce.extract0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %coerce.tuple, 0
+  %coerce.extract1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %coerce.tuple, 1
+  %call = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @foo(<vscale x 2 x i32> %coerce.extract0, <vscale x 2 x i32> %coerce.extract1)
+  ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %call
+}
+
+declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @foo(<vscale x 2 x i32>, <vscale x 2 x i32>)
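
The SROA test is the IR-level counterpart of the O0 check block in rvv-tuple-type.c: the three scalable-vector allocas in @return_tuple are a round trip through memory that mem2reg-style promotion inside SROA folds away, leaving only the insertvalue/extractvalue chain and the call, as the CHECK lines show. The file's RUN lines sit outside this hunk; presumably it is driven through opt with the sroa pass plus FileCheck, as is usual for tests under llvm/test/Transforms/SROA.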