diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 9449e70930913..1e730218722b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -67,6 +67,8 @@ FUNCTION_PASS("amdgpu-simplifylib", AMDGPUSimplifyLibCallsPass())
 FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
               AMDGPUUnifyDivergentExitNodesPass())
 FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
+FUNCTION_PASS("amdgpu-vector-idiom",
+              AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32))
 FUNCTION_PASS("si-annotate-control-flow",
               SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
 #undef FUNCTION_PASS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 92a587b5771b6..1249e25114e1f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -34,6 +34,7 @@
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
 #include "AMDGPUUnifyDivergentExitNodes.h"
+#include "AMDGPUVectorIdiom.h"
 #include "AMDGPUWaitSGPRHazards.h"
 #include "GCNDPPCombine.h"
 #include "GCNIterativeScheduler.h"
@@ -905,6 +906,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         EnablePromoteKernelArguments)
       FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
 
+    // Run vector-idiom canonicalization early (after inlining) and before
+    // infer-AS / SROA to maximize scalarization opportunities.
+    // Specify 32 bytes since the largest HIP vector types are double4 or
+    // long4.
+    FPM.addPass(AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32));
+
     // Add infer address spaces pass to the opt pipeline after inlining
     // but before SROA to increase SROA opportunities.
     FPM.addPass(InferAddressSpacesPass());
@@ -953,6 +960,8 @@
       if (EnableLowerModuleLDS)
         PM.addPass(AMDGPULowerModuleLDSPass(*this));
       if (Level != OptimizationLevel::O0) {
+        PM.addPass(createModuleToFunctionPassAdaptor(
+            AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32)));
         // We only want to run this with O2 or higher since inliner and SROA
         // don't run in O1.
         if (Level != OptimizationLevel::O1) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp b/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp
new file mode 100644
index 0000000000000..2703fff9f4e9a
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp
@@ -0,0 +1,519 @@
+//===- AMDGPUVectorIdiom.cpp ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// AMDGPU-specific vector idiom canonicalizations to unblock SROA and
+// subsequent scalarization/vectorization.
+//
+// Motivation:
+// - HIP vector types are often modeled as structs and copied with memcpy.
+//   Address-level selects on such copies block SROA. Converting to value-level
+//   operations or splitting the CFG enables SROA to break aggregates, which
+//   unlocks scalarization/vectorization on AMDGPU.
+//
+// Example pattern:
+//   %src = select i1 %c, ptr %A, ptr %B
+//   call void @llvm.memcpy(ptr %dst, ptr %src, i32 16, i1 false)
+//
+// Objectives:
+// - Canonicalize small memcpy patterns where source or destination is a select
+//   of pointers.
+// - Prefer value-level selects (on loaded values) over address-level selects
+//   when safe.
+// - When speculation is unsafe, split the CFG to isolate each arm.
+//
+// Assumptions:
+// - Only handles non-volatile memcpy with constant length N where
+//   0 < N <= MaxBytes (default 32).
+// - Source and destination must be in the same address space.
+// - Speculative loads are allowed only if a conservative alignment check
+//   passes.
+// - No speculative stores are introduced.
+//
+// Transformations:
+// - Source-select memcpy: attempt speculative loads -> value select -> single
+//   store. Fallback is CFG split with two memcpy calls.
+// - Destination-select memcpy: always CFG split to avoid speculative stores.
+//
+// Run this pass early, before SROA.
+//
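+// As an illustrative sketch only (value names and the element type are for
+// exposition; the element type is actually chosen by getIntOrVecTypeForSize
+// from the copy size and the proven alignment), a safe source-select copy of
+// 16 bytes from the example pattern above is rewritten roughly as:
+//
+//   %va = load <4 x i32>, ptr %A
+//   %vb = load <4 x i32>, ptr %B
+//   %v  = select i1 %c, <4 x i32> %va, <4 x i32> %vb
+//   store <4 x i32> %v, ptr %dst
+//
+// while a select-fed destination (or an unsafe source) is instead guarded by
+// a branch on %c, with one memcpy per arm rejoining in a common successor:
+//
+//   br i1 %c, label %memcpy.then, label %memcpy.else
+//   memcpy.then:  memcpy(..., true arm, ...);  br label %memcpy.join
+//   memcpy.else:  memcpy(..., false arm, ...); br label %memcpy.join
+//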
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUVectorIdiom.h"
+#include "AMDGPU.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "amdgpu-vector-idiom"
+
+namespace {
+
+static cl::opt<bool>
+    AMDGPUVectorIdiomEnable("amdgpu-vector-idiom-enable",
+                            cl::desc("Enable pass AMDGPUVectorIdiom"),
+                            cl::init(true));
+
+// Selects an integer or integer-vector element type matching NBytes, using the
+// minimum proven alignment to decide the widest safe element width.
+// Assumptions:
+// - Pointee types are opaque; the element choice is based solely on size and
+//   alignment.
+// - Falls back to <N x i8> if wider lanes are not safe/aligned.
+static Type *getIntOrVecTypeForSize(uint64_t NBytes, LLVMContext &Ctx,
+                                    Align MinProvenAlign = Align(1)) {
+  auto CanUseI64 = [&]() { return MinProvenAlign >= Align(8); };
+  auto CanUseI32 = [&]() { return MinProvenAlign >= Align(4); };
+  auto CanUseI16 = [&]() { return MinProvenAlign >= Align(2); };
+
+  if (NBytes == 32 && CanUseI64())
+    return FixedVectorType::get(Type::getInt64Ty(Ctx), 4);
+
+  if ((NBytes % 4) == 0 && CanUseI32())
+    return FixedVectorType::get(Type::getInt32Ty(Ctx), NBytes / 4);
+
+  if ((NBytes % 2) == 0 && CanUseI16())
+    return FixedVectorType::get(Type::getInt16Ty(Ctx), NBytes / 2);
+
+  return FixedVectorType::get(Type::getInt8Ty(Ctx), NBytes);
+}
+
+static Align minAlign(Align A, Align B) { return A < B ? A : B; }
+
+// Checks if the underlying object of a memcpy operand is an alloca.
+// This helps focus on scratch memory optimizations by filtering out
+// memcpy operations that don't involve stack-allocated memory.
+static bool hasAllocaUnderlyingObject(Value *V) {
+  Value *Underlying = getUnderlyingObject(V);
+  return isa<AllocaInst>(Underlying);
+}
+
+// Checks if both pointer operands can be speculatively loaded for N bytes and
+// computes the minimum alignment to use.
+// Notes:
+// - Intentionally conservative: relies on isDereferenceablePointer and
+//   getOrEnforceKnownAlignment.
+// - AA/TLI are not used for deeper reasoning here.
+// Emits verbose LLVM_DEBUG logs explaining why speculation is disallowed.
+// Return false reasons include: either arm not dereferenceable, or computed
+// known alignment < 1.
+static bool bothArmsSafeToSpeculateLoads(Value *A, Value *B, uint64_t Size,
+                                         Align &OutAlign, const DataLayout &DL,
+                                         AssumptionCache *AC,
+                                         const DominatorTree *DT,
+                                         Instruction *CtxI) {
+  APInt SizeAPInt(DL.getIndexTypeSizeInBits(A->getType()), Size);
+  if (!isDereferenceableAndAlignedPointer(B, Align(1), SizeAPInt, DL, CtxI, AC,
+                                          DT, nullptr)) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: false arm "
+                      << "(B) not dereferenceable for " << Size
+                      << " bytes at align(1)\n");
+    LLVM_DEBUG(dbgs() << " false arm (B) value: " << *B << '\n');
+    return false;
+  }
+
+  Align AlignB =
+      llvm::getOrEnforceKnownAlignment(B, Align(1), DL, nullptr, AC, DT);
+
+  if (AlignB < Align(1)) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: known "
+                      << "alignment of false arm (B) < 1: " << AlignB.value()
+                      << '\n');
+    return false;
+  }
+
+  if (!isDereferenceableAndAlignedPointer(A, Align(1), SizeAPInt, DL, CtxI, AC,
+                                          DT, nullptr)) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: true arm "
+                      << "(A) not dereferenceable for " << Size
+                      << " bytes at align(1)\n");
+    LLVM_DEBUG(dbgs() << " true arm (A) value: " << *A << '\n');
+    return false;
+  }
+
+  Align AlignA =
+      llvm::getOrEnforceKnownAlignment(A, Align(1), DL, nullptr, AC, DT);
+
+  if (AlignA < Align(1)) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: known "
+                      << "alignment of true arm (A) < 1: " << AlignA.value()
+                      << '\n');
+    return false;
+  }
+
+  OutAlign = minAlign(AlignA, AlignB);
+  LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Speculative loads allowed: "
+                    << "minAlign=" << OutAlign.value() << '\n');
+  return true;
+}
+
+struct AMDGPUVectorIdiomImpl {
+  const unsigned MaxBytes;
+  bool CFGChanged = false;
+
+  AMDGPUVectorIdiomImpl(unsigned MaxBytes) : MaxBytes(MaxBytes) {}
+
+  // Rewrites memcpy when the source is a select of pointers. Prefers a
+  // value-level select (two loads + select + one store) if speculative loads
+  // are safe. Otherwise, falls back to a guarded CFG split with two memcpy
+  // calls. Assumptions:
+  // - Non-volatile, constant length, within MaxBytes.
+  // - Source and destination in the same address space.
+  bool transformSelectMemcpySource(MemCpyInst &MT, SelectInst &Sel,
+                                   const DataLayout &DL,
+                                   const DominatorTree *DT,
+                                   AssumptionCache *AC) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Considering memcpy(select-src): "
+                      << MT << '\n');
+    IRBuilder<> B(&MT);
+    Value *Dst = MT.getRawDest();
+    Value *A = Sel.getTrueValue();
+    Value *Bv = Sel.getFalseValue();
+
+    ConstantInt *LenCI = cast<ConstantInt>(MT.getLength());
+    uint64_t N = LenCI->getLimitedValue();
+
+    if (Sel.isVolatile()) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not rewriting: Select marked "
+                        << "volatile (unexpected) in memcpy source\n");
+      return false;
+    }
+
+    // This is a null check - always use CFG split.
+    Value *Cond = Sel.getCondition();
+    ICmpInst *ICmp = dyn_cast<ICmpInst>(Cond);
+    if (ICmp && ICmp->isEquality() &&
+        (isa<ConstantPointerNull>(ICmp->getOperand(0)) ||
+         isa<ConstantPointerNull>(ICmp->getOperand(1)))) {
+      splitCFGForMemcpy(MT, Sel.getCondition(), A, Bv, true);
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Null check pattern - "
+                           "using CFG split\n");
+      return true;
+    }
+
+    Align DstAlign = MaybeAlign(MT.getDestAlign()).valueOrOne();
+    Align AlignAB;
+    bool CanSpeculate = false;
+
+    const CallBase &CB = MT;
+    const unsigned SrcArgIdx = 1;
+    uint64_t DerefBytes = CB.getParamDereferenceableBytes(SrcArgIdx);
+    bool HasDerefOrNull =
+        CB.paramHasAttr(SrcArgIdx, Attribute::DereferenceableOrNull);
+    bool HasNonNull = CB.paramHasAttr(SrcArgIdx, Attribute::NonNull);
+    MaybeAlign SrcParamAlign = CB.getParamAlign(SrcArgIdx);
+    Align ProvenSrcAlign =
+        SrcParamAlign.value_or(MaybeAlign(MT.getSourceAlign()).valueOrOne());
+
+    if (DerefBytes > 0) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] memcpy source param attrs: "
+                        << "dereferenceable(" << DerefBytes << ")"
+                        << (HasDerefOrNull ? " (or null)" : "")
+                        << (HasNonNull ? ", nonnull" : "") << ", align "
+                        << ProvenSrcAlign.value() << '\n');
+      if (DerefBytes >= N && (!HasDerefOrNull || HasNonNull)) {
+        LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Using memcpy source operand "
+                          << "attributes at this use; accepting speculation\n");
+        CanSpeculate = true;
+        AlignAB = ProvenSrcAlign;
+      } else {
+        LLVM_DEBUG(
+            dbgs() << "[AMDGPUVectorIdiom] Source param attrs not strong "
+                   << "enough for speculation: need dereferenceable(" << N
+                   << ") and nonnull; got dereferenceable(" << DerefBytes << ")"
+                   << (HasDerefOrNull ? " (or null)" : "")
+                   << (HasNonNull ? ", nonnull" : "") << '\n');
+      }
+    } else {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] memcpy source param has no "
+                        << "dereferenceable bytes attribute; align "
+                        << ProvenSrcAlign.value() << '\n');
+    }
+    if (!CanSpeculate)
+      CanSpeculate =
+          bothArmsSafeToSpeculateLoads(A, Bv, N, AlignAB, DL, AC, DT, &MT);
+
+    if (CanSpeculate) {
+      Align MinAlign = std::min(AlignAB, DstAlign);
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewriting memcpy(select-src) "
+                        << "with value-level select; N=" << N
+                        << " minAlign=" << MinAlign.value() << '\n');
+
+      Type *Ty = getIntOrVecTypeForSize(N, B.getContext(), MinAlign);
+
+      LoadInst *LA = B.CreateAlignedLoad(Ty, A, MinAlign);
+      LoadInst *LB = B.CreateAlignedLoad(Ty, Bv, MinAlign);
+      Value *V = B.CreateSelect(Sel.getCondition(), LA, LB);
+
+      (void)B.CreateAlignedStore(V, Dst, DstAlign);
+
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewrote memcpy(select-src) to "
+                           "value-select loads/stores: "
+                        << MT << '\n');
+      MT.eraseFromParent();
+      return true;
+    }
+
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Falling back to CFG split for "
+                      << "memcpy(select-src); speculation unsafe\n");
+    splitCFGForMemcpy(MT, Sel.getCondition(), A, Bv, true);
+    LLVM_DEBUG(
+        dbgs()
+        << "[AMDGPUVectorIdiom] Rewrote memcpy(select-src) by CFG split\n");
+    return true;
+  }
+
+  // Rewrites memcpy when the destination is a select of pointers. To avoid
+  // speculative stores, always splits the CFG and emits a memcpy per branch.
+  // Assumptions mirror the source case.
+  bool transformSelectMemcpyDest(MemCpyInst &MT, SelectInst &Sel) {
+    Value *DA = Sel.getTrueValue();
+    Value *DB = Sel.getFalseValue();
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewriting memcpy(select-dst) via "
+                      << "CFG split to avoid speculative stores: " << MT
+                      << '\n');
+
+    splitCFGForMemcpy(MT, Sel.getCondition(), DA, DB, false);
+    LLVM_DEBUG(
+        dbgs()
+        << "[AMDGPUVectorIdiom] Rewrote memcpy(select-dst) by CFG split\n");
+    return true;
+  }
+
+  // Splits the CFG around a memcpy whose source or destination depends on a
+  // condition. Clones memcpy in then/else using TruePtr/FalsePtr and rejoins.
+  // Assumptions:
+  // - MT has constant length and is non-volatile.
+  // - TruePtr/FalsePtr are correct replacements for the selected operand.
+  void splitCFGForMemcpy(MemCpyInst &MT, Value *Cond, Value *TruePtr,
+                         Value *FalsePtr, bool IsSource) {
+    CFGChanged = true;
+
+    Function *F = MT.getFunction();
+    BasicBlock *Cur = MT.getParent();
+    BasicBlock *ThenBB = BasicBlock::Create(F->getContext(), "memcpy.then", F);
+    BasicBlock *ElseBB = BasicBlock::Create(F->getContext(), "memcpy.else", F);
+    BasicBlock *JoinBB =
+        Cur->splitBasicBlock(BasicBlock::iterator(&MT), "memcpy.join");
+
+    Cur->getTerminator()->eraseFromParent();
+    IRBuilder<> B(Cur);
+    B.CreateCondBr(Cond, ThenBB, ElseBB);
+
+    ConstantInt *LenCI = cast<ConstantInt>(MT.getLength());
+
+    IRBuilder<> BT(ThenBB);
+    if (IsSource) {
+      (void)BT.CreateMemCpy(MT.getRawDest(), MT.getDestAlign(), TruePtr,
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    } else {
+      (void)BT.CreateMemCpy(TruePtr, MT.getDestAlign(), MT.getRawSource(),
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    }
+    BT.CreateBr(JoinBB);
+
+    IRBuilder<> BE(ElseBB);
+    if (IsSource) {
+      (void)BE.CreateMemCpy(MT.getRawDest(), MT.getDestAlign(), FalsePtr,
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    } else {
+      (void)BE.CreateMemCpy(FalsePtr, MT.getDestAlign(), MT.getRawSource(),
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    }
+    BE.CreateBr(JoinBB);
+
+    MT.eraseFromParent();
+  }
+};
+
+} // end anonymous namespace
+
+AMDGPUVectorIdiomCombinePass::AMDGPUVectorIdiomCombinePass(unsigned MaxBytes)
+    : MaxBytes(MaxBytes) {}
+
+// Pass driver that locates small, constant-size, non-volatile memcpy calls
+// where source or destination is a select in the same address space. Applies
+// the source/destination transforms described above. Intended to run early to
+// maximize SROA and subsequent optimizations.
+PreservedAnalyses
+AMDGPUVectorIdiomCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+  auto &AC = FAM.getResult<AssumptionAnalysis>(F);
+
+  if (!AMDGPUVectorIdiomEnable)
+    return PreservedAnalyses::all();
+
+  SmallVector<MemCpyInst *> Worklist;
+  for (Instruction &I : instructions(F)) {
+    if (auto *MC = dyn_cast<MemCpyInst>(&I))
+      Worklist.push_back(MC);
+  }
+
+  bool Changed = false;
+  AMDGPUVectorIdiomImpl Impl(MaxBytes);
+
+  for (MemCpyInst *MT : Worklist) {
+    Value *Dst = MT->getRawDest();
+    Value *Src = MT->getRawSource();
+    if (!isa<SelectInst>(Src) && !isa<SelectInst>(Dst))
+      continue;
+
+    LLVM_DEBUG({
+      Value *DstV = MT->getRawDest();
+      Value *SrcV = MT->getRawSource();
+      unsigned DstAS = cast<PointerType>(DstV->getType())->getAddressSpace();
+      unsigned SrcAS = cast<PointerType>(SrcV->getType())->getAddressSpace();
+      Value *LenV = MT->getLength();
+
+      auto dumpPtrForms = [&](StringRef Label, Value *V) {
+        dbgs() << " " << Label << ": " << *V << '\n';
+
+        Value *StripCasts = V->stripPointerCasts();
+        if (StripCasts != V)
+          dbgs() << " - stripCasts: " << *StripCasts << '\n';
+        else
+          dbgs() << " - stripCasts: (no change)\n";
+
+        Value *Underlying = getUnderlyingObject(V);
+        if (Underlying != V)
+          dbgs() << " - underlying: " << *Underlying << '\n';
+        else
+          dbgs() << " - underlying: (no change)\n";
+      };
+
+      auto dumpSelect = [&](StringRef Which, Value *V) {
+        if (auto *SI = dyn_cast<SelectInst>(V)) {
+          dbgs() << " - " << Which << " is Select: " << *SI << '\n';
+          dbgs() << " cond: " << *SI->getCondition() << '\n';
+          Value *T = SI->getTrueValue();
+          Value *Fv = SI->getFalseValue();
+          dumpPtrForms("true", T);
+          dumpPtrForms("false", Fv);
+          dbgs() << " trueIsAlloca="
+                 << (hasAllocaUnderlyingObject(T) ? "true" : "false") << '\n';
+          dbgs() << " falseIsAlloca="
+                 << (hasAllocaUnderlyingObject(Fv) ? "true" : "false") << '\n';
+        }
+      };
+
+      dbgs() << "[AMDGPUVectorIdiom] Found memcpy: " << *MT << '\n'
+             << " in function: " << F.getName() << '\n'
+             << " - volatile=" << (MT->isVolatile() ? "true" : "false") << '\n'
+             << " - sameAS=" << (DstAS == SrcAS ? "true" : "false")
+             << " (dstAS=" << DstAS << ", srcAS=" << SrcAS << ")\n"
+             << " - constLen=" << (isa<ConstantInt>(LenV) ? "true" : "false");
+      if (auto *LCI = dyn_cast<ConstantInt>(LenV))
+        dbgs() << " (N=" << LCI->getLimitedValue() << ")";
+      dbgs() << '\n'
+             << " - srcIsSelect=" << (isa<SelectInst>(SrcV) ? "true" : "false")
+             << '\n'
+             << " - dstIsSelect=" << (isa<SelectInst>(DstV) ? "true" : "false")
+             << '\n'
+             << " - srcIsAlloca="
+             << (hasAllocaUnderlyingObject(SrcV) ? "true" : "false") << '\n'
+             << " - dstIsAlloca="
+             << (hasAllocaUnderlyingObject(DstV) ? "true" : "false") << '\n';
+
+      dumpSelect("src", SrcV);
+      dumpSelect("dst", DstV);
+    });
+
+    if (MT->isVolatile()) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: memcpy is volatile\n");
+      continue;
+    }
+
+    ConstantInt *LenCI = dyn_cast<ConstantInt>(MT->getLength());
+    if (!LenCI) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: memcpy length is not a "
+                        << "constant integer\n");
+      continue;
+    }
+
+    uint64_t N = LenCI->getLimitedValue();
+    if (N == 0 || N > MaxBytes) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: memcpy size out of range "
+                        << "(N=" << N << ", MaxBytes=" << MaxBytes << ")\n");
+      continue;
+    }
+
+    unsigned DstAS = cast<PointerType>(Dst->getType())->getAddressSpace();
+    unsigned SrcAS = cast<PointerType>(Src->getType())->getAddressSpace();
+    if (DstAS != SrcAS) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: address space mismatch "
+                        << "(dstAS=" << DstAS << ", srcAS=" << SrcAS << ")\n");
+      continue;
+    }
+
+    // Check if we have select instructions and if their operands are
+    // alloca-based.
+    bool ShouldTransform = false;
+    if (auto *Sel = dyn_cast<SelectInst>(Src)) {
+      bool TrueIsAlloca = hasAllocaUnderlyingObject(Sel->getTrueValue());
+      bool FalseIsAlloca = hasAllocaUnderlyingObject(Sel->getFalseValue());
+      if (TrueIsAlloca || FalseIsAlloca) {
+        ShouldTransform = true;
+        Changed |= Impl.transformSelectMemcpySource(*MT, *Sel, DL, &DT, &AC);
+      } else {
+        LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: select source operands "
+                          << "are not alloca-based\n");
+      }
+      continue;
+    }
+    if (auto *Sel = dyn_cast<SelectInst>(Dst)) {
+      bool TrueIsAlloca = hasAllocaUnderlyingObject(Sel->getTrueValue());
+      bool FalseIsAlloca = hasAllocaUnderlyingObject(Sel->getFalseValue());
+      if (TrueIsAlloca || FalseIsAlloca) {
+        ShouldTransform = true;
+        Changed |= Impl.transformSelectMemcpyDest(*MT, *Sel);
+      } else {
+        LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: select destination "
+                          << "operands are not alloca-based\n");
+      }
+      continue;
+    }
+
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: neither source nor "
+                      << "destination is a select of pointers\n");
+  }
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  // Be conservative: preserve only analyses we know remain valid.
+  PreservedAnalyses PA;
+  PA.preserve<AssumptionAnalysis>();
+  PA.preserve<TargetLibraryAnalysis>();
+  PA.preserve<TargetIRAnalysis>();
+
+  // If we didn't change the CFG, we can keep DT/LI/PostDT.
+  if (!Impl.CFGChanged) {
+    PA.preserve<DominatorTreeAnalysis>();
+    PA.preserve<LoopAnalysis>();
+    PA.preserve<PostDominatorTreeAnalysis>();
+  }
+
+  return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.h b/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.h
new file mode 100644
index 0000000000000..339a604a092a0
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.h
@@ -0,0 +1,43 @@
+//===- AMDGPUVectorIdiom.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU-specific vector idiom canonicalizations to unblock SROA and
+// subsequent scalarization/vectorization.
+//
+// This pass rewrites memcpy with select-fed operands into either:
+// - a value-level select (two loads + select + store), when safe to
+//   speculatively load both arms, or
+// - a conservative CFG split around the condition to isolate each arm.
+//
+// Run this pass early, before SROA.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUVECTORIDIOM_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUVECTORIDIOM_H
+
+#include "AMDGPU.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class AMDGPUVectorIdiomCombinePass
+    : public PassInfoMixin<AMDGPUVectorIdiomCombinePass> {
+  unsigned MaxBytes;
+
+public:
+  /// \p MaxBytes is the max memcpy size (in bytes) to transform in
+  /// AMDGPUVectorIdiom.
+  AMDGPUVectorIdiomCombinePass(unsigned MaxBytes);
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUVECTORIDIOM_H
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index aae56eef73edd..a90512d77c288 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -119,6 +119,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUTargetTransformInfo.cpp
   AMDGPUWaitSGPRHazards.cpp
   AMDGPUUnifyDivergentExitNodes.cpp
+  AMDGPUVectorIdiom.cpp
   R600MachineCFGStructurizer.cpp
   GCNCreateVOPD.cpp
   GCNDPPCombine.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-vector-idiom-memcpy-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-vector-idiom-memcpy-select.ll
new file mode 100644
index 0000000000000..b866ba643562e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-vector-idiom-memcpy-select.ll
@@ -0,0 +1,391 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -amdgpu-vector-idiom-enable -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-vector-idiom -S %s | FileCheck %s
+
+; This test verifies the AMDGPUVectorIdiomCombinePass transforms:
+; 1) memcpy with select-fed source into a value-level select between two loads,
+;    followed by one store (when it's safe to speculate both loads).
+; 2) memcpy with select-fed destination into a control-flow split with two memcpys.
+
+@G0 = addrspace(1) global [4 x i32] zeroinitializer, align 16
+@G1 = addrspace(1) global [4 x i32] zeroinitializer, align 16
+
+declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg)
+declare void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg)
+
+; -----------------------------------------------------------------------------
+; Source is a select. Expect value-level select of two <4 x i32> loads
+; and a single store, with no remaining memcpy.
+;
+define amdgpu_kernel void @value_select_src(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @value_select_src(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[PA:%.*]] = alloca [4 x i32], align 16, addrspace(5)
+; CHECK-NEXT:    [[PB:%.*]] = alloca [4 x i32], align 16, addrspace(5)
+; CHECK-NEXT:    [[DST:%.*]] = alloca [4 x i32], align 16, addrspace(5)
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(5) [[PA]], ptr addrspace(5) [[PB]]
+; CHECK-NEXT:    [[LA:%.*]] = load <4 x i32>, ptr addrspace(5) [[PA]], align 16
+; CHECK-NEXT:    [[LB:%.*]] = load <4 x i32>, ptr addrspace(5) [[PB]], align 16
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND]], <4 x i32> [[LA]], <4 x i32> [[LB]]
+; CHECK-NEXT:    store <4 x i32> [[SEL]], ptr addrspace(5) [[DST]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  ; Pointers to two 16-byte aligned buffers using alloca.
+  %pa = alloca [4 x i32], align 16, addrspace(5)
+  %pb = alloca [4 x i32], align 16, addrspace(5)
+  %dst = alloca [4 x i32], align 16, addrspace(5)
+  %src = select i1 %cond, ptr addrspace(5) %pa, ptr addrspace(5) %pb
+
+  ; Provide explicit operand alignments so the pass can emit an aligned store.
+  call void @llvm.memcpy.p5.p5.i64(
+      ptr addrspace(5) align 16 %dst,
+      ptr addrspace(5) align 16 %src,
+      i64 16, i1 false)
+
+  ret void
+}
+
+; -----------------------------------------------------------------------------
+; Destination is a select. Expect CFG split with two memcpys guarded
+; by a branch (we do not speculate stores in this pass).
+;
+define amdgpu_kernel void @dest_select_cfg_split(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @dest_select_cfg_split(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DA:%.*]] = alloca [4 x i32], align 16, addrspace(5)
+; CHECK-NEXT:    [[DB:%.*]] = alloca [4 x i32], align 16, addrspace(5)
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [4 x i32], align 16, addrspace(5)
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(5) [[DA]], ptr addrspace(5) [[DB]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
+; CHECK:       [[MEMCPY_JOIN:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[MEMCPY_THEN]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) [[DA]], ptr addrspace(5) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+; CHECK:       [[MEMCPY_ELSE]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) [[DB]], ptr addrspace(5) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+;
+entry:
+  %da = alloca [4 x i32], align 16, addrspace(5)
+  %db = alloca [4 x i32], align 16, addrspace(5)
+  %src = alloca [4 x i32], align 16, addrspace(5)
+  %dst = select i1 %cond, ptr addrspace(5) %da, ptr addrspace(5) %db
+  call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) %dst, ptr addrspace(5) %src, i64 16, i1 false)
+  ret void
+}
+
+; -----------------------------------------------------------------------------
+; Source is a select, 4 x double (32 bytes).
+; Expect value-level select of two <4 x i64> loads and a single store, no memcpy.
+;
+@G2 = addrspace(1) global [4 x double] zeroinitializer, align 32
+@G3 = addrspace(1) global [4 x double] zeroinitializer, align 32
+define amdgpu_kernel void @value_select_src_4xd(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @value_select_src_4xd(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[PA:%.*]] = alloca [4 x double], align 32, addrspace(5)
+; CHECK-NEXT:    [[PB:%.*]] = alloca [4 x double], align 32, addrspace(5)
+; CHECK-NEXT:    [[DST:%.*]] = alloca [4 x double], align 32, addrspace(5)
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(5) [[PA]], ptr addrspace(5) [[PB]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr addrspace(5) [[PA]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(5) [[PB]], align 32
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND]], <4 x i64> [[TMP0]], <4 x i64> [[TMP1]]
+; CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr addrspace(5) [[DST]], align 32
+; CHECK-NEXT:    ret void
+;
+entry:
+  %pa = alloca [4 x double], align 32, addrspace(5)
+  %pb = alloca [4 x double], align 32, addrspace(5)
+  %dst = alloca [4 x double], align 32, addrspace(5)
+  %src = select i1 %cond, ptr addrspace(5) %pa, ptr addrspace(5) %pb
+
+  call void @llvm.memcpy.p5.p5.i64(
+      ptr addrspace(5) align 32 %dst,
+      ptr addrspace(5) align 32 %src,
+      i64 32, i1 false)
+
+  ret void
+}
+
+; -----------------------------------------------------------------------------
+; Source is a select, 3 x char (3 bytes).
+; Expect value-level select using <3 x i8> loads/stores, no memcpy.
+;
+@G4 = addrspace(1) global [3 x i8] zeroinitializer, align 1
+@G5 = addrspace(1) global [3 x i8] zeroinitializer, align 1
+define amdgpu_kernel void @value_select_src_3xc(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @value_select_src_3xc(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[PA:%.*]] = alloca [3 x i8], align 1, addrspace(5)
+; CHECK-NEXT:    [[PB:%.*]] = alloca [3 x i8], align 1, addrspace(5)
+; CHECK-NEXT:    [[DST:%.*]] = alloca [3 x i8], align 1, addrspace(5)
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(5) [[PA]], ptr addrspace(5) [[PB]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i8>, ptr addrspace(5) [[PA]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, ptr addrspace(5) [[PB]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND]], <3 x i8> [[TMP0]], <3 x i8> [[TMP1]]
+; CHECK-NEXT:    store <3 x i8> [[TMP2]], ptr addrspace(5) [[DST]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %pa = alloca [3 x i8], align 1, addrspace(5)
+  %pb = alloca [3 x i8], align 1, addrspace(5)
+  %dst = alloca [3 x i8], align 1, addrspace(5)
+  %src = select i1 %cond, ptr addrspace(5) %pa, ptr addrspace(5) %pb
+
+  call void @llvm.memcpy.p5.p5.i64(
+      ptr addrspace(5) align 1 %dst,
+      ptr addrspace(5) align 1 %src,
+      i64 3, i1 false)
+
+  ret void
+}
+
+; -----------------------------------------------------------------------------
+; Source is a select with constant expression GEP arms. Neither arm is
+; alloca-based, so the pass skips it and the memcpy remains.
+;
+@GEPA = addrspace(1) global [4 x i32] zeroinitializer, align 16
+@GEPB = addrspace(1) global [4 x i32] zeroinitializer, align 16
+define amdgpu_kernel void @value_select_src_constexpr_gep(ptr addrspace(1) %dst, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @value_select_src_constexpr_gep(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) @GEPA, ptr addrspace(1) @GEPB
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 16 [[DST]], ptr addrspace(1) align 16 [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  ; Constant expression GEPs to the base elements
+  %src = select i1 %cond,
+      ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GEPA, i64 0, i64 0),
+      ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GEPB, i64 0, i64 0)
+
+  call void @llvm.memcpy.p1.p1.i64(
+      ptr addrspace(1) align 16 %dst,
+      ptr addrspace(1) align 16 %src,
+      i64 16, i1 false)
+
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Destination is a select with constant expression GEP arms. Neither arm is
+; alloca-based, so no CFG split is performed and the memcpy remains.
+;
+define amdgpu_kernel void @dest_select_constexpr_gep(ptr addrspace(1) %src, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @dest_select_constexpr_gep(
+; CHECK-SAME: ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) @GEPA, ptr addrspace(1) @GEPB
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = select i1 %cond,
+      ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GEPA, i64 0, i64 0),
+      ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GEPB, i64 0, i64 0)
+
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Source is a select where one arm is null. Neither arm is alloca-based, so the
+; pass skips this memcpy (no speculative loads and no CFG split).
+;
+@GN = addrspace(1) global [4 x i32] zeroinitializer, align 16
+define amdgpu_kernel void @src_select_null_arm(ptr addrspace(1) %dst, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @src_select_null_arm(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[NONNULL:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) @GN, i64 0, i64 0
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[NONNULL]], ptr addrspace(1) null
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonnull = getelementptr inbounds [4 x i32], ptr addrspace(1) @GN, i64 0, i64 0
+  %src = select i1 %cond, ptr addrspace(1) %nonnull, ptr addrspace(1) null
+
+  call void @llvm.memcpy.p1.p1.i64(
+      ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Destination is a select where one arm is null. Neither arm is alloca-based,
+; so the pass skips this memcpy (no speculative stores and no CFG split).
+;
+define amdgpu_kernel void @dst_select_null_arm(ptr addrspace(1) %src, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @dst_select_null_arm(
+; CHECK-SAME: ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) null, ptr addrspace(1) @GN
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = select i1 %cond, ptr addrspace(1) null,
+      ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GN, i64 0, i64 0)
+
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Source is a select where one arm is poison. Neither arm is alloca-based, so
+; the pass skips this memcpy and no CFG split is emitted.
+;
+@GP = addrspace(1) global [4 x i32] zeroinitializer, align 16
+define amdgpu_kernel void @src_select_poison_arm(ptr addrspace(1) %dst, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @src_select_poison_arm(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[NONNULL:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) @GP, i64 0, i64 0
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[NONNULL]], ptr addrspace(1) poison
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonnull = getelementptr inbounds [4 x i32], ptr addrspace(1) @GP, i64 0, i64 0
+  %src = select i1 %cond, ptr addrspace(1) %nonnull, ptr addrspace(1) poison
+
+  call void @llvm.memcpy.p1.p1.i64(
+      ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Destination is a select where one arm is poison. Neither arm is alloca-based,
+; so the memcpy is left untouched.
+;
+define amdgpu_kernel void @dst_select_poison_arm(ptr addrspace(1) %src, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @dst_select_poison_arm(
+; CHECK-SAME: ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) poison, ptr addrspace(1) @GP
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = select i1 %cond, ptr addrspace(1) poison,
+      ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GP, i64 0, i64 0)
+
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Non-constant memcpy length: the pass should not transform.
+; Expect: memcpy remains as-is (no load/select/store, no CFG split).
+;
+define amdgpu_kernel void @memcpy_nonconst_length_src_select(ptr addrspace(1) %dst,
+; CHECK-LABEL: define amdgpu_kernel void @memcpy_nonconst_length_src_select(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[PA:%.*]], ptr addrspace(1) [[PB:%.*]], i1 [[COND:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PA]], ptr addrspace(1) [[PB]]
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 [[N]], i1 false)
+; CHECK-NEXT:    ret void
+;
+                                                              ptr addrspace(1) %pa,
+                                                              ptr addrspace(1) %pb,
+                                                              i1 %cond, i64 %n) {
+entry:
+  %src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst,
+                                   ptr addrspace(1) %src,
+                                   i64 %n, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Non-constant memcpy length with destination select: pass should not transform.
+; Expect: memcpy remains, no CFG split.
+;
+define amdgpu_kernel void @memcpy_nonconst_length_dst_select(ptr addrspace(1) %da,
+; CHECK-LABEL: define amdgpu_kernel void @memcpy_nonconst_length_dst_select(
+; CHECK-SAME: ptr addrspace(1) [[DA:%.*]], ptr addrspace(1) [[DB:%.*]], ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) [[DA]], ptr addrspace(1) [[DB]]
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 [[N]], i1 false)
+; CHECK-NEXT:    ret void
+;
+                                                              ptr addrspace(1) %db,
+                                                              ptr addrspace(1) %src,
+                                                              i1 %cond, i64 %n) {
+entry:
+  %dst = select i1 %cond, ptr addrspace(1) %da, ptr addrspace(1) %db
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst,
+                                   ptr addrspace(1) %src,
+                                   i64 %n, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; "Non-constant source" scenario: select arms are function args (not globals).
+; The arms are not alloca-based, so the pass skips this memcpy entirely; no
+; speculation and no CFG split take place.
+;
+define amdgpu_kernel void @memcpy_src_select_arg_arms_cfg_split(ptr addrspace(1) %dst,
+; CHECK-LABEL: define amdgpu_kernel void @memcpy_src_select_arg_arms_cfg_split(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[PA:%.*]], ptr addrspace(1) [[PB:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PA]], ptr addrspace(1) [[PB]]
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+                                                                ptr addrspace(1) %pa,
+                                                                ptr addrspace(1) %pb,
+                                                                i1 %cond) {
+entry:
+  %src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst,
+                                   ptr addrspace(1) %src,
+                                   i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; memmove should be ignored by the pass even with select-fed source/dest.
+; Expect: memmove remains as-is (no CFG split, no speculative transform).
+;
+declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture writeonly,
+                                     ptr addrspace(1) nocapture readonly,
+                                     i64, i1 immarg)
+
+define amdgpu_kernel void @memmove_ignored_src_select(ptr addrspace(1) %dst,
+; CHECK-LABEL: define amdgpu_kernel void @memmove_ignored_src_select(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[PA:%.*]], ptr addrspace(1) [[PB:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PA]], ptr addrspace(1) [[PB]]
+; CHECK-NEXT:    call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+                                                       ptr addrspace(1) %pa,
+                                                       ptr addrspace(1) %pb,
+                                                       i1 %cond) {
+entry:
+  %src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
+  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst,
+                                    ptr addrspace(1) %src,
+                                    i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; memset should be ignored by the pass, even if destination is a select.
+; Expect: memset remains as-is (no CFG split).
+;
+declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture writeonly,
+                                 i8, i64, i1 immarg)
+
+define amdgpu_kernel void @memset_ignored_dst_select(ptr addrspace(1) %da,
+; CHECK-LABEL: define amdgpu_kernel void @memset_ignored_dst_select(
+; CHECK-SAME: ptr addrspace(1) [[DA:%.*]], ptr addrspace(1) [[DB:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) [[DA]], ptr addrspace(1) [[DB]]
+; CHECK-NEXT:    call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST]], i8 0, i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+                                                      ptr addrspace(1) %db,
+                                                      i1 %cond) {
+entry:
+  %dst = select i1 %cond, ptr addrspace(1) %da, ptr addrspace(1) %db
+  call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 0, i64 16, i1 false)
+  ret void
+}
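+; -----------------------------------------------------------------------------
+; Illustrative additional case, hand-written rather than autogenerated by
+; update_test_checks.py: when the select condition is a pointer null check,
+; transformSelectMemcpySource is expected to take the CFG-split path even
+; though one arm is an alloca. The CHECK lines below are a loose sketch of the
+; expected split, not regenerated output.
+;
+define amdgpu_kernel void @src_select_null_check_cond(ptr addrspace(5) %q) {
+; CHECK-LABEL: define amdgpu_kernel void @src_select_null_check_cond(
+; CHECK: br i1 %is.null, label %memcpy.then, label %memcpy.else
+; CHECK: memcpy.join:
+; CHECK-NEXT: ret void
+; CHECK: memcpy.then:
+; CHECK-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 %pa, i64 16, i1 false)
+; CHECK: memcpy.else:
+; CHECK-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 %q, i64 16, i1 false)
+entry:
+  %pa = alloca [4 x i32], align 16, addrspace(5)
+  %dst = alloca [4 x i32], align 16, addrspace(5)
+  %is.null = icmp eq ptr addrspace(5) %q, null
+  %src = select i1 %is.null, ptr addrspace(5) %pa, ptr addrspace(5) %q
+  call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 %src, i64 16, i1 false)
+  ret void
+}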