diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index f7384ced3e23e4..af5ea1ce5f4597 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4850,33 +4850,43 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
   unsigned WidenSize = WidenVT.getSizeInBits();
   unsigned InSize = InVT.getSizeInBits();
+  unsigned InScalarSize = InVT.getScalarSizeInBits();
   // x86mmx is not an acceptable vector element type, so don't try.
-  if (WidenSize % InSize == 0 && InVT != MVT::x86mmx) {
+  if (WidenSize % InScalarSize == 0 && InVT != MVT::x86mmx) {
     // Determine new input vector type. The new input vector type will use
     // the same element type (if its a vector) or use the input type as a
     // vector. It is the same size as the type to widen to.
     EVT NewInVT;
-    unsigned NewNumElts = WidenSize / InSize;
+    unsigned NewNumParts = WidenSize / InSize;
     if (InVT.isVector()) {
       EVT InEltVT = InVT.getVectorElementType();
       NewInVT = EVT::getVectorVT(*DAG.getContext(), InEltVT,
                                  WidenSize / InEltVT.getSizeInBits());
     } else {
-      NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumElts);
+      NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumParts);
     }
 
     if (TLI.isTypeLegal(NewInVT)) {
       SDValue NewVec;
       if (InVT.isVector()) {
         // Because the result and the input are different vector types, widening
-        // the result could create a legal type but widening the input might make
-        // it an illegal type that might lead to repeatedly splitting the input
-        // and then widening it. To avoid this, we widen the input only if
+        // the result could create a legal type but widening the input might
+        // make it an illegal type that might lead to repeatedly splitting the
+        // input and then widening it. To avoid this, we widen the input only if
         // it results in a legal type.
-        SmallVector<SDValue, 16> Ops(NewNumElts, DAG.getUNDEF(InVT));
-        Ops[0] = InOp;
+        if (WidenSize % InSize == 0) {
+          SmallVector<SDValue, 16> Ops(NewNumParts, DAG.getUNDEF(InVT));
+          Ops[0] = InOp;
-        NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
+          NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
+        } else {
+          SmallVector<SDValue, 16> Ops;
+          DAG.ExtractVectorElements(InOp, Ops);
+          Ops.append(WidenSize / InScalarSize - Ops.size(),
+                     DAG.getUNDEF(InVT.getVectorElementType()));
+
+          NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, NewInVT, Ops);
+        }
       } else {
         NewVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewInVT, InOp);
       }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
index 757da88d8d1086..6027c3c96e8699 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -381,3 +381,478 @@ end:
   store <4 x double> %phi_cast, ptr addrspace(1) %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}bitcast_v20f16_to_v5f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v20f16_to_v5f64(i32 %cond, ptr addrspace(1) %out, <20 x half> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <20 x half> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <20 x half> %phi_value to <5 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10f32_to_v5f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10f32_to_v5f64(i32 %cond, ptr addrspace(1) %out, <10 x float> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x float> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x float> %phi_value to <5 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10i32_to_v5f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10i32_to_v5f64(i32 %cond, ptr addrspace(1) %out, <10 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x i32> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x i32> %phi_value to <5 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10f32_to_v5i64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10f32_to_v5i64(i32 %cond, ptr addrspace(1) %out, <10 x float> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x float> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x float> %phi_value to <5 x i64>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x i64> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10i32_to_v5i64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10i32_to_v5i64(i32 %cond, ptr addrspace(1) %out, <10 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x i32> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x i32> %phi_value to <5 x i64>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x i64> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v40i8_to_v5f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v40i8_to_v5f64(i32 %cond, ptr addrspace(1) %out, <40 x i8> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <40 x i8> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <40 x i8> %phi_value to <5 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v40i8_to_v5i64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v40i8_to_v5i64(i32 %cond, ptr addrspace(1) %out, <40 x i8> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <40 x i8> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <40 x i8> %phi_value to <5 x i64>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x i64> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v5f64_to_v10f32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v5f64_to_v10f32(i32 %cond, ptr addrspace(1) %out, <5 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <5 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <5 x double> %phi_value to <10 x float>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <10 x float> [zeroinitializer, %entry], [%cast, %if]
+  store <10 x float> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v5f64_to_v10i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v5f64_to_v10i32(i32 %cond, ptr addrspace(1) %out, <5 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <5 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <5 x double> %phi_value to <10 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <10 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <10 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v5i64_to_v10f32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v5i64_to_v10f32(i32 %cond, ptr addrspace(1) %out, <5 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <5 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <5 x i64> %phi_value to <10 x float>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <10 x float> [zeroinitializer, %entry], [%cast, %if]
+  store <10 x float> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v5i64_to_v10i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v5i64_to_v10i32(i32 %cond, ptr addrspace(1) %out, <5 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <5 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <5 x i64> %phi_value to <10 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <10 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <10 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v6f64_to_v12i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v6f64_to_v12i32(i32 %cond, ptr addrspace(1) %out, <6 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <6 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <6 x double> %phi_value to <12 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <12 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <12 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v6f64_to_v12f32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v6f64_to_v12f32(i32 %cond, ptr addrspace(1) %out, <6 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <6 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <6 x double> %phi_value to <12 x float>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <12 x float> [zeroinitializer, %entry], [%cast, %if]
+  store <12 x float> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v12i32_to_v6i64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v12i32_to_v6i64(i32 %cond, ptr addrspace(1) %out, <12 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <12 x i32> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <12 x i32> %phi_value to <6 x i64>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <6 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <6 x i64> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v12i32_to_v6f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v12i32_to_v6f64(i32 %cond, ptr addrspace(1) %out, <12 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <12 x i32> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <12 x i32> %phi_value to <6 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <6 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <6 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v6i64_to_v12i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v6i64_to_v12i32(i32 %cond, ptr addrspace(1) %out, <6 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <6 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <6 x i64> %phi_value to <12 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <12 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <12 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v7i64_to_v14i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v7i64_to_v14i32(i32 %cond, ptr addrspace(1) %out, <7 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <7 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <7 x i64> %phi_value to <14 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <14 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <14 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v7f64_to_v14i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v7f64_to_v14i32(i32 %cond, ptr addrspace(1) %out, <7 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <7 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <7 x double> %phi_value to <14 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <14 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <14 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v9i64_to_v18i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v9i64_to_v18i32(i32 %cond, ptr addrspace(1) %out, <9 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <9 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <9 x i64> %phi_value to <18 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <18 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <18 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10i64_to_v20i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10i64_to_v20i32(i32 %cond, ptr addrspace(1) %out, <10 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x i64> %phi_value to <20 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <20 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <20 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v11i64_to_v22i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v11i64_to_v22i32(i32 %cond, ptr addrspace(1) %out, <11 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <11 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <11 x i64> %phi_value to <22 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <22 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <22 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v12i64_to_v24i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v12i64_to_v24i32(i32 %cond, ptr addrspace(1) %out, <12 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <12 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <12 x i64> %phi_value to <24 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <24 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <24 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v13i64_to_v26i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v13i64_to_v26i32(i32 %cond, ptr addrspace(1) %out, <13 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <13 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <13 x i64> %phi_value to <26 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <26 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <26 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v14i64_to_v28i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v14i64_to_v28i32(i32 %cond, ptr addrspace(1) %out, <14 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <14 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <14 x i64> %phi_value to <28 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <28 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <28 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v15i64_to_v30i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v15i64_to_v30i32(i32 %cond, ptr addrspace(1) %out, <15 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <15 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <15 x i64> %phi_value to <30 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <30 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <30 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index b006c6b2aff7ab..3145c1c3e868bc 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -171,20 +171,14 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
-; SI-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
-; SI-NEXT:    s_mov_b32 s26, -1
-; SI-NEXT:    s_mov_b32 s27, 0xe8f000
-; SI-NEXT:    s_add_u32 s24, s24, s3
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_addc_u32 s25, s25, 0
 ; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    s_mov_b32 s22, s10
 ; SI-NEXT:    s_mov_b32 s23, s11
@@ -203,30 +197,24 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa
 ;
 ; VI-LABEL: test_copy_v4i8_x4:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT:    s_mov_b32 s90, -1
-; VI-NEXT:    s_mov_b32 s91, 0xe80000
-; VI-NEXT:    s_add_u32 s88, s88, s3
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x44
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_mov_b32 s11, 0xf000
 ; VI-NEXT:    s_mov_b32 s10, -1
-; VI-NEXT:    s_addc_u32 s89, s89, 0
+; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_load_dwordx8
s[0:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s22, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s22, s10 ; VI-NEXT: s_mov_b32 s23, s11 ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll index 2b4651487eff6c..2b8a712b28c054 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll @@ -7,27 +7,21 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa ; RRLIST-LABEL: sccClobber: ; RRLIST: ; %bb.0: ; %entry ; RRLIST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; RRLIST-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; RRLIST-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; RRLIST-NEXT: s_mov_b32 s22, -1 -; RRLIST-NEXT: s_mov_b32 s23, 0xe00000 -; RRLIST-NEXT: s_add_u32 s20, s20, s3 +; RRLIST-NEXT: v_mov_b32_e32 v2, 0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) ; RRLIST-NEXT: s_load_dword s16, s[8:9], 0x0 ; RRLIST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; RRLIST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 ; RRLIST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44 ; RRLIST-NEXT: s_load_dword s17, s[10:11], 0x0 -; RRLIST-NEXT: s_addc_u32 s21, s21, 0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) -; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; RRLIST-NEXT: s_min_i32 s4, s16, 0 +; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; RRLIST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; RRLIST-NEXT: s_and_b64 s[0:1], vcc, exec ; RRLIST-NEXT: s_cselect_b32 s0, s16, s17 ; RRLIST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3] ; RRLIST-NEXT: s_cselect_b32 s0, s4, s0 -; RRLIST-NEXT: v_mov_b32_e32 v2, 0 ; RRLIST-NEXT: v_mov_b32_e32 v0, s0 ; RRLIST-NEXT: global_store_dword v2, v0, s[14:15] ; RRLIST-NEXT: s_endpgm @@ -35,27 +29,21 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa ; FAST-LABEL: sccClobber: ; FAST: ; %bb.0: ; %entry ; FAST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; FAST-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; FAST-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; FAST-NEXT: s_mov_b32 s22, -1 -; FAST-NEXT: s_mov_b32 s23, 0xe00000 -; FAST-NEXT: s_add_u32 s20, s20, s3 +; FAST-NEXT: v_mov_b32_e32 v2, 0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) ; FAST-NEXT: s_load_dword s16, s[8:9], 0x0 ; FAST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; FAST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 ; FAST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44 ; FAST-NEXT: s_load_dword s17, s[10:11], 0x0 -; FAST-NEXT: s_addc_u32 s21, s21, 0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) -; FAST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; FAST-NEXT: s_min_i32 s4, s16, 0 +; FAST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; FAST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; FAST-NEXT: s_and_b64 s[0:1], vcc, exec ; FAST-NEXT: s_cselect_b32 s0, s16, s17 ; FAST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3] ; FAST-NEXT: s_cselect_b32 s0, s4, s0 -; FAST-NEXT: v_mov_b32_e32 v2, 0 ; FAST-NEXT: v_mov_b32_e32 v0, s0 ; FAST-NEXT: global_store_dword v2, v0, s[14:15] ; FAST-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 41a4998b3ba918..027f3c360b426b 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -4032,14 +4032,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s3 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -4063,14 +4057,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 -; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s3 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -4231,14 +4219,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s3 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN1-NEXT: s_add_u32 s2, s4, s2 @@ -4260,14 +4242,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 -; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s3 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN2-NEXT: s_add_u32 s2, s4, s2 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index 48aa210665e79e..d295efc6d015f7 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -5010,17 +5010,12 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; CI-NEXT: s_mov_b32 s18, -1 ; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; CI-NEXT: s_mov_b32 s19, 0xe8f000 -; CI-NEXT: s_add_u32 s16, s16, s3 -; CI-NEXT: s_addc_u32 s17, s17, 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b64 s[10:11], s[10:11], 3 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_mov_b32_e32 v4, s10 ; CI-NEXT: s_mov_b32 s0, s6 ; CI-NEXT: s_mov_b32 s1, s7 @@ -5035,20 +5030,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 -; VI-NEXT: s_mov_b32 s90, -1 -; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s3 -; VI-NEXT: s_addc_u32 s89, s89, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -5074,18 +5062,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: s_addc_u32 s3, s5, s3 @@ -5296,17 +5278,12 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; CI-NEXT: s_mov_b32 s18, -1 ; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; CI-NEXT: s_mov_b32 s19, 0xe8f000 -; CI-NEXT: s_add_u32 s16, s16, s3 -; CI-NEXT: s_addc_u32 s17, s17, 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b64 s[10:11], s[10:11], 3 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_mov_b32_e32 v4, s10 ; CI-NEXT: s_mov_b32 s0, s6 ; CI-NEXT: s_mov_b32 s1, s7 @@ -5321,20 +5298,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 -; VI-NEXT: s_mov_b32 s90, -1 -; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s3 -; VI-NEXT: s_addc_u32 s89, s89, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; VI-NEXT: s_add_u32 s2, s4, s2 @@ -5358,18 +5328,12 @@ 
define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: s_addc_u32 s3, s5, s3 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 032b8b89fb4eed..03e1960ca7c6aa 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -610,16 +610,10 @@ entry: define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) { ; GCN-LABEL: double5_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s18, -1 -; GCN-NEXT: s_mov_b32 s19, 0xe80000 -; GCN-NEXT: s_add_u32 s16, s16, s3 ; GCN-NEXT: s_load_dword s12, s[0:1], 0xa4 ; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x84 ; GCN-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 ; GCN-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 -; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s12, 4 ; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index abee0b2d9c5b45..be01fa8ab1e635 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -5,19 +5,14 @@ define amdgpu_kernel void @select_f16( ; SI-LABEL: select_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s26, -1 -; SI-NEXT: s_mov_b32 s27, 0xe8f000 -; SI-NEXT: s_add_u32 s24, s24, s3 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s6 ; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s20, s8 ; SI-NEXT: s_mov_b32 s21, s9 @@ -39,7 +34,6 @@ define amdgpu_kernel void @select_f16( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_addc_u32 s25, s25, 0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -52,19 +46,14 @@ define amdgpu_kernel void @select_f16( ; ; VI-LABEL: select_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_mov_b32 s26, -1 -; VI-NEXT: s_mov_b32 s27, 0xe80000 -; VI-NEXT: s_add_u32 s24, s24, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s6 ; VI-NEXT: s_mov_b32 s17, s7 -; 
VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s20, s8 ; VI-NEXT: s_mov_b32 s21, s9 @@ -86,7 +75,6 @@ define amdgpu_kernel void @select_f16( ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_addc_u32 s25, s25, 0 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -432,19 +420,14 @@ entry: define amdgpu_kernel void @select_v2f16( ; SI-LABEL: select_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s26, -1 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s27, 0xe8f000 -; SI-NEXT: s_add_u32 s24, s24, s3 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s6 ; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s20, s8 ; SI-NEXT: s_mov_b32 s21, s9 @@ -462,7 +445,6 @@ define amdgpu_kernel void @select_v2f16( ; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_addc_u32 s25, s25, 0 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -492,13 +474,8 @@ define amdgpu_kernel void @select_v2f16( ; ; VI-LABEL: select_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 -; VI-NEXT: s_mov_b32 s26, -1 -; VI-NEXT: s_mov_b32 s27, 0xe80000 -; VI-NEXT: s_add_u32 s24, s24, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s14, s2 @@ -522,7 +499,6 @@ define amdgpu_kernel void @select_v2f16( ; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_addc_u32 s25, s25, 0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll index 433368bf616eb5..9b92e03a01c117 100644 --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -67,19 +67,14 @@ entry: define amdgpu_kernel void @madak_f16_use_2( ; SI-LABEL: madak_f16_use_2: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s22, -1 -; SI-NEXT: s_mov_b32 s23, 0xe8f000 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; SI-NEXT: s_add_u32 s20, s20, s3 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s8 ; SI-NEXT: s_mov_b32 s17, s9 -; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s11 @@ -96,7 +91,6 @@ define amdgpu_kernel void @madak_f16_use_2( ; SI-NEXT: v_mov_b32_e32 v3, 0x41200000 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_addc_u32 s21, s21, 0 ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -112,19 +106,14 @@ define amdgpu_kernel void @madak_f16_use_2( ; ; VI-LABEL: 
madak_f16_use_2: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_mov_b32 s22, -1 -; VI-NEXT: s_mov_b32 s23, 0xe80000 ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 -; VI-NEXT: s_add_u32 s20, s20, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s8 ; VI-NEXT: s_mov_b32 s17, s9 -; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 @@ -141,7 +130,6 @@ define amdgpu_kernel void @madak_f16_use_2( ; VI-NEXT: v_mov_b32_e32 v3, 0x4900 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_addc_u32 s21, s21, 0 ; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 584d6354840a94..e6bc773c272bdd 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1660,19 +1660,13 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b96 v[4:6], v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b96 off, v[4:6], s32 -; GFX11-NEXT: global_load_b96 v[4:6], v[2:3], off -; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b96 off, v[4:6], s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:16 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[3:4], off ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <6 x half>, ptr addrspace(1) %arg0 %val1 = load <6 x half>, ptr addrspace(1) %arg1