diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index f7384ced3e23e4..af5ea1ce5f4597 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4850,33 +4850,43 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
   unsigned WidenSize = WidenVT.getSizeInBits();
   unsigned InSize = InVT.getSizeInBits();
+  unsigned InScalarSize = InVT.getScalarSizeInBits();
   // x86mmx is not an acceptable vector element type, so don't try.
-  if (WidenSize % InSize == 0 && InVT != MVT::x86mmx) {
+  if (WidenSize % InScalarSize == 0 && InVT != MVT::x86mmx) {
     // Determine new input vector type. The new input vector type will use
     // the same element type (if its a vector) or use the input type as a
     // vector. It is the same size as the type to widen to.
     EVT NewInVT;
-    unsigned NewNumElts = WidenSize / InSize;
+    unsigned NewNumParts = WidenSize / InSize;
     if (InVT.isVector()) {
       EVT InEltVT = InVT.getVectorElementType();
       NewInVT = EVT::getVectorVT(*DAG.getContext(), InEltVT,
                                  WidenSize / InEltVT.getSizeInBits());
     } else {
-      NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumElts);
+      NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumParts);
     }
 
     if (TLI.isTypeLegal(NewInVT)) {
       SDValue NewVec;
       if (InVT.isVector()) {
         // Because the result and the input are different vector types, widening
-        // the result could create a legal type but widening the input might make
-        // it an illegal type that might lead to repeatedly splitting the input
-        // and then widening it. To avoid this, we widen the input only if
+        // the result could create a legal type but widening the input might
+        // make it an illegal type that might lead to repeatedly splitting the
+        // input and then widening it. To avoid this, we widen the input only if
         // it results in a legal type.
-        SmallVector<SDValue, 16> Ops(NewNumElts, DAG.getUNDEF(InVT));
-        Ops[0] = InOp;
+        if (WidenSize % InSize == 0) {
+          SmallVector<SDValue, 16> Ops(NewNumParts, DAG.getUNDEF(InVT));
+          Ops[0] = InOp;
-        NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
+          NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
+        } else {
+          SmallVector<SDValue, 16> Ops;
+          DAG.ExtractVectorElements(InOp, Ops);
+          Ops.append(WidenSize / InScalarSize - Ops.size(),
+                     DAG.getUNDEF(InVT.getVectorElementType()));
+
+          NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, NewInVT, Ops);
+        }
       } else {
         NewVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewInVT, InOp);
       }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
index 757da88d8d1086..6027c3c96e8699 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -381,3 +381,478 @@ end:
   store <4 x double> %phi_cast, ptr addrspace(1) %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}bitcast_v20f16_to_v5f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v20f16_to_v5f64(i32 %cond, ptr addrspace(1) %out, <20 x half> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <20 x half> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <20 x half> %phi_value to <5 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10f32_to_v5f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10f32_to_v5f64(i32 %cond, ptr addrspace(1) %out, <10 x float> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x float> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x float> %phi_value to <5 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10i32_to_v5f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10i32_to_v5f64(i32 %cond, ptr addrspace(1) %out, <10 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x i32> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x i32> %phi_value to <5 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10f32_to_v5i64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10f32_to_v5i64(i32 %cond, ptr addrspace(1) %out, <10 x float> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x float> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x float> %phi_value to <5 x i64>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x i64> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10i32_to_v5i64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10i32_to_v5i64(i32 %cond, ptr addrspace(1) %out, <10 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x i32> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x i32> %phi_value to <5 x i64>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x i64> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v40i8_to_v5f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v40i8_to_v5f64(i32 %cond, ptr addrspace(1) %out, <40 x i8> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <40 x i8> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <40 x i8> %phi_value to <5 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v40i8_to_v5i64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v40i8_to_v5i64(i32 %cond, ptr addrspace(1) %out, <40 x i8> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <40 x i8> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <40 x i8> %phi_value to <5 x i64>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x i64> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v5f64_to_v10f32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v5f64_to_v10f32(i32 %cond, ptr addrspace(1) %out, <5 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <5 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <5 x double> %phi_value to <10 x float>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <10 x float> [zeroinitializer, %entry], [%cast, %if]
+  store <10 x float> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v5f64_to_v10i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v5f64_to_v10i32(i32 %cond, ptr addrspace(1) %out, <5 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <5 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <5 x double> %phi_value to <10 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <10 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <10 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v5i64_to_v10f32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v5i64_to_v10f32(i32 %cond, ptr addrspace(1) %out, <5 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <5 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <5 x i64> %phi_value to <10 x float>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <10 x float> [zeroinitializer, %entry], [%cast, %if]
+  store <10 x float> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v5i64_to_v10i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v5i64_to_v10i32(i32 %cond, ptr addrspace(1) %out, <5 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <5 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <5 x i64> %phi_value to <10 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <10 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <10 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v6f64_to_v12i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v6f64_to_v12i32(i32 %cond, ptr addrspace(1) %out, <6 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <6 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <6 x double> %phi_value to <12 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <12 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <12 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v6f64_to_v12f32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v6f64_to_v12f32(i32 %cond, ptr addrspace(1) %out, <6 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <6 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <6 x double> %phi_value to <12 x float>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <12 x float> [zeroinitializer, %entry], [%cast, %if]
+  store <12 x float> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v12i32_to_v6i64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v12i32_to_v6i64(i32 %cond, ptr addrspace(1) %out, <12 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <12 x i32> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <12 x i32> %phi_value to <6 x i64>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <6 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <6 x i64> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v12i32_to_v6f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v12i32_to_v6f64(i32 %cond, ptr addrspace(1) %out, <12 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <12 x i32> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <12 x i32> %phi_value to <6 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <6 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <6 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v6i64_to_v12i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v6i64_to_v12i32(i32 %cond, ptr addrspace(1) %out, <6 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <6 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <6 x i64> %phi_value to <12 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <12 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <12 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v7i64_to_v14i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v7i64_to_v14i32(i32 %cond, ptr addrspace(1) %out, <7 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <7 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <7 x i64> %phi_value to <14 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <14 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <14 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v7f64_to_v14i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v7f64_to_v14i32(i32 %cond, ptr addrspace(1) %out, <7 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <7 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <7 x double> %phi_value to <14 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <14 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <14 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v9i64_to_v18i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v9i64_to_v18i32(i32 %cond, ptr addrspace(1) %out, <9 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <9 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <9 x i64> %phi_value to <18 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <18 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <18 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10i64_to_v20i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10i64_to_v20i32(i32 %cond, ptr addrspace(1) %out, <10 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x i64> %phi_value to <20 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <20 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <20 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v11i64_to_v22i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v11i64_to_v22i32(i32 %cond, ptr addrspace(1) %out, <11 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <11 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <11 x i64> %phi_value to <22 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <22 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <22 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v12i64_to_v24i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v12i64_to_v24i32(i32 %cond, ptr addrspace(1) %out, <12 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <12 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <12 x i64> %phi_value to <24 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <24 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <24 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v13i64_to_v26i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v13i64_to_v26i32(i32 %cond, ptr addrspace(1) %out, <13 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <13 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <13 x i64> %phi_value to <26 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <26 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <26 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v14i64_to_v28i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v14i64_to_v28i32(i32 %cond, ptr addrspace(1) %out, <14 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <14 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <14 x i64> %phi_value to <28 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <28 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <28 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v15i64_to_v30i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v15i64_to_v30i32(i32 %cond, ptr addrspace(1) %out, <15 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <15 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <15 x i64> %phi_value to <30 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <30 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <30 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index b006c6b2aff7ab..3145c1c3e868bc 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -171,20 +171,14 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
-; SI-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
-; SI-NEXT:    s_mov_b32 s26, -1
-; SI-NEXT:    s_mov_b32 s27, 0xe8f000
-; SI-NEXT:    s_add_u32 s24, s24, s3
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_addc_u32 s25, s25, 0
 ; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    s_mov_b32 s22, s10
 ; SI-NEXT:    s_mov_b32 s23, s11
@@ -203,30 +197,24 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa
 ;
 ; VI-LABEL: test_copy_v4i8_x4:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT:    s_mov_b32 s90, -1
-; VI-NEXT:    s_mov_b32 s91, 0xe80000
-; VI-NEXT:    s_add_u32 s88, s88, s3
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x44
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_mov_b32 s11, 0xf000
 ; VI-NEXT:    s_mov_b32 s10, -1
-; VI-NEXT:    s_addc_u32 s89, s89, 0
+; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_load_dwordx8
s[0:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s22, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s22, s10 ; VI-NEXT: s_mov_b32 s23, s11 ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll index 2b4651487eff6c..2b8a712b28c054 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll @@ -7,27 +7,21 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa ; RRLIST-LABEL: sccClobber: ; RRLIST: ; %bb.0: ; %entry ; RRLIST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; RRLIST-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; RRLIST-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; RRLIST-NEXT: s_mov_b32 s22, -1 -; RRLIST-NEXT: s_mov_b32 s23, 0xe00000 -; RRLIST-NEXT: s_add_u32 s20, s20, s3 +; RRLIST-NEXT: v_mov_b32_e32 v2, 0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) ; RRLIST-NEXT: s_load_dword s16, s[8:9], 0x0 ; RRLIST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; RRLIST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 ; RRLIST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44 ; RRLIST-NEXT: s_load_dword s17, s[10:11], 0x0 -; RRLIST-NEXT: s_addc_u32 s21, s21, 0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) -; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; RRLIST-NEXT: s_min_i32 s4, s16, 0 +; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; RRLIST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; RRLIST-NEXT: s_and_b64 s[0:1], vcc, exec ; RRLIST-NEXT: s_cselect_b32 s0, s16, s17 ; RRLIST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3] ; RRLIST-NEXT: s_cselect_b32 s0, s4, s0 -; RRLIST-NEXT: v_mov_b32_e32 v2, 0 ; RRLIST-NEXT: v_mov_b32_e32 v0, s0 ; RRLIST-NEXT: global_store_dword v2, v0, s[14:15] ; RRLIST-NEXT: s_endpgm @@ -35,27 +29,21 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa ; FAST-LABEL: sccClobber: ; FAST: ; %bb.0: ; %entry ; FAST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; FAST-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; FAST-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; FAST-NEXT: s_mov_b32 s22, -1 -; FAST-NEXT: s_mov_b32 s23, 0xe00000 -; FAST-NEXT: s_add_u32 s20, s20, s3 +; FAST-NEXT: v_mov_b32_e32 v2, 0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) ; FAST-NEXT: s_load_dword s16, s[8:9], 0x0 ; FAST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; FAST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 ; FAST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44 ; FAST-NEXT: s_load_dword s17, s[10:11], 0x0 -; FAST-NEXT: s_addc_u32 s21, s21, 0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) -; FAST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; FAST-NEXT: s_min_i32 s4, s16, 0 +; FAST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; FAST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; FAST-NEXT: s_and_b64 s[0:1], vcc, exec ; FAST-NEXT: s_cselect_b32 s0, s16, s17 ; FAST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3] ; FAST-NEXT: s_cselect_b32 s0, s4, s0 -; FAST-NEXT: v_mov_b32_e32 v2, 0 ; FAST-NEXT: v_mov_b32_e32 v0, s0 ; FAST-NEXT: global_store_dword v2, v0, s[14:15] ; FAST-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 41a4998b3ba918..027f3c360b426b 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -4032,14 +4032,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s3 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -4063,14 +4057,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 -; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s3 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -4231,14 +4219,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s3 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN1-NEXT: s_add_u32 s2, s4, s2 @@ -4260,14 +4242,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 -; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s3 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN2-NEXT: s_add_u32 s2, s4, s2 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index 48aa210665e79e..d295efc6d015f7 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -5010,17 +5010,12 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; CI-NEXT: s_mov_b32 s18, -1 ; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; CI-NEXT: s_mov_b32 s19, 0xe8f000 -; CI-NEXT: s_add_u32 s16, s16, s3 -; CI-NEXT: s_addc_u32 s17, s17, 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b64 s[10:11], s[10:11], 3 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_mov_b32_e32 v4, s10 ; CI-NEXT: s_mov_b32 s0, s6 ; CI-NEXT: s_mov_b32 s1, s7 @@ -5035,20 +5030,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 -; VI-NEXT: s_mov_b32 s90, -1 -; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s3 -; VI-NEXT: s_addc_u32 s89, s89, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -5074,18 +5062,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: s_addc_u32 s3, s5, s3 @@ -5296,17 +5278,12 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; CI-NEXT: s_mov_b32 s18, -1 ; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; CI-NEXT: s_mov_b32 s19, 0xe8f000 -; CI-NEXT: s_add_u32 s16, s16, s3 -; CI-NEXT: s_addc_u32 s17, s17, 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b64 s[10:11], s[10:11], 3 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_mov_b32_e32 v4, s10 ; CI-NEXT: s_mov_b32 s0, s6 ; CI-NEXT: s_mov_b32 s1, s7 @@ -5321,20 +5298,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 -; VI-NEXT: s_mov_b32 s90, -1 -; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s3 -; VI-NEXT: s_addc_u32 s89, s89, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; VI-NEXT: s_add_u32 s2, s4, s2 @@ -5358,18 +5328,12 @@ 
define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: s_addc_u32 s3, s5, s3 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 032b8b89fb4eed..03e1960ca7c6aa 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -610,16 +610,10 @@ entry: define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) { ; GCN-LABEL: double5_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s18, -1 -; GCN-NEXT: s_mov_b32 s19, 0xe80000 -; GCN-NEXT: s_add_u32 s16, s16, s3 ; GCN-NEXT: s_load_dword s12, s[0:1], 0xa4 ; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x84 ; GCN-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 ; GCN-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 -; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s12, 4 ; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index abee0b2d9c5b45..be01fa8ab1e635 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -5,19 +5,14 @@ define amdgpu_kernel void @select_f16( ; SI-LABEL: select_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s26, -1 -; SI-NEXT: s_mov_b32 s27, 0xe8f000 -; SI-NEXT: s_add_u32 s24, s24, s3 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s6 ; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s20, s8 ; SI-NEXT: s_mov_b32 s21, s9 @@ -39,7 +34,6 @@ define amdgpu_kernel void @select_f16( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_addc_u32 s25, s25, 0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -52,19 +46,14 @@ define amdgpu_kernel void @select_f16( ; ; VI-LABEL: select_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_mov_b32 s26, -1 -; VI-NEXT: s_mov_b32 s27, 0xe80000 -; VI-NEXT: s_add_u32 s24, s24, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s6 ; VI-NEXT: s_mov_b32 s17, s7 -; 
VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s20, s8 ; VI-NEXT: s_mov_b32 s21, s9 @@ -86,7 +75,6 @@ define amdgpu_kernel void @select_f16( ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_addc_u32 s25, s25, 0 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -432,19 +420,14 @@ entry: define amdgpu_kernel void @select_v2f16( ; SI-LABEL: select_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s26, -1 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s27, 0xe8f000 -; SI-NEXT: s_add_u32 s24, s24, s3 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s6 ; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s20, s8 ; SI-NEXT: s_mov_b32 s21, s9 @@ -462,7 +445,6 @@ define amdgpu_kernel void @select_v2f16( ; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_addc_u32 s25, s25, 0 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -492,13 +474,8 @@ define amdgpu_kernel void @select_v2f16( ; ; VI-LABEL: select_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 -; VI-NEXT: s_mov_b32 s26, -1 -; VI-NEXT: s_mov_b32 s27, 0xe80000 -; VI-NEXT: s_add_u32 s24, s24, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s14, s2 @@ -522,7 +499,6 @@ define amdgpu_kernel void @select_v2f16( ; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_addc_u32 s25, s25, 0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll index 433368bf616eb5..9b92e03a01c117 100644 --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -67,19 +67,14 @@ entry: define amdgpu_kernel void @madak_f16_use_2( ; SI-LABEL: madak_f16_use_2: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s22, -1 -; SI-NEXT: s_mov_b32 s23, 0xe8f000 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; SI-NEXT: s_add_u32 s20, s20, s3 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s8 ; SI-NEXT: s_mov_b32 s17, s9 -; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s11 @@ -96,7 +91,6 @@ define amdgpu_kernel void @madak_f16_use_2( ; SI-NEXT: v_mov_b32_e32 v3, 0x41200000 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_addc_u32 s21, s21, 0 ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -112,19 +106,14 @@ define amdgpu_kernel void @madak_f16_use_2( ; ; VI-LABEL: 
madak_f16_use_2: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_mov_b32 s22, -1 -; VI-NEXT: s_mov_b32 s23, 0xe80000 ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 -; VI-NEXT: s_add_u32 s20, s20, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s8 ; VI-NEXT: s_mov_b32 s17, s9 -; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 @@ -141,7 +130,6 @@ define amdgpu_kernel void @madak_f16_use_2( ; VI-NEXT: v_mov_b32_e32 v3, 0x4900 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_addc_u32 s21, s21, 0 ; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 584d6354840a94..e6bc773c272bdd 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1660,19 +1660,13 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b96 v[4:6], v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b96 off, v[4:6], s32 -; GFX11-NEXT: global_load_b96 v[4:6], v[2:3], off -; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b96 off, v[4:6], s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:16 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[3:4], off ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <6 x half>, ptr addrspace(1) %arg0 %val1 = load <6 x half>, ptr addrspace(1) %arg1