[SelectionDAG] Widening the result of INSERT_SUBVECTOR.

Widens the result and first input vector because they have the same size. The subvector to be inserted is widened in the operand widen function. Differential Revision: https://reviews.llvm.org/D112187
llvm · Oct 27, 2021 · 1137b72 · 1137b72
1 parent fbc0c30
commit 1137b72
Show file tree

Hide file tree

Showing 3 changed files with 102 additions and 0 deletions.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -908,6 +908,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N);
   SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N);
   SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
+  SDValue WidenVecRes_INSERT_SUBVECTOR(SDNode *N);
   SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
   SDValue WidenVecRes_LOAD(SDNode* N);
   SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3038,6 +3038,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::BITCAST:           Res = WidenVecRes_BITCAST(N); break;
   case ISD::BUILD_VECTOR:      Res = WidenVecRes_BUILD_VECTOR(N); break;
   case ISD::CONCAT_VECTORS:    Res = WidenVecRes_CONCAT_VECTORS(N); break;
+  case ISD::INSERT_SUBVECTOR:
+    Res = WidenVecRes_INSERT_SUBVECTOR(N);
+    break;
   case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
   case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
   case ISD::LOAD:              Res = WidenVecRes_LOAD(N); break;
@@ -4059,6 +4062,16 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
   return DAG.getBuildVector(WidenVT, dl, Ops);
 }
 
+SDValue DAGTypeLegalizer::WidenVecRes_INSERT_SUBVECTOR(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+  SDValue InOp2 = N->getOperand(1);
+  SDValue Idx = N->getOperand(2);
+  SDLoc dl(N);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WidenVT, InOp1, InOp2, Idx);
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
   EVT      VT = N->getValueType(0);
   EVT      EltVT = VT.getVectorElementType();

diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -399,3 +399,91 @@ declare <vscale x 6 x i16> @llvm.experimental.vector.insert.nxv6i16.nxv1i16(<vsc
 declare <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.nxv2i16(<vscale x 8 x i16>, <vscale x 2 x i16>, i64)
 
 declare <vscale x 8 x half> @llvm.experimental.vector.insert.nxv8f16.nxv2f16(<vscale x 8 x half>, <vscale x 2 x half>, i64)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;  Upacked types that need result widening
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 3 x i32> @insert_nxv3i32_nxv2i32(<vscale x 2 x i32> %sv0) {
+; CHECK-LABEL: insert_nxv3i32_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.d, z0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 3 x i32> @llvm.experimental.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32> undef, <vscale x 2 x i32> %sv0, i64 0)
+ ret <vscale x 3 x i32> %v0
+}
+
+;; Check that the Subvector is not widen so it does not crash.
+define <vscale x 3 x i32> @insert_nxv3i32_nxv2i32_2(<vscale x 3 x i32> %sv0, <vscale x 2 x i32> %sv1) {
+; CHECK-LABEL: insert_nxv3i32_nxv2i32_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 3 x i32> @llvm.experimental.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32> %sv0, <vscale x 2 x i32> %sv1, i64 0)
+  ret <vscale x 3 x i32> %v0
+}
+
+define <vscale x 3 x float> @insert_nxv3f32_nxv2f32(<vscale x 2 x float> %sv0) nounwind {
+; CHECK-LABEL: insert_nxv3f32_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1w { z0.d }, p0, [sp]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 3 x float> @llvm.experimental.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float> undef, <vscale x 2 x float> %sv0, i64 0)
+ ret <vscale x 3 x float> %v0
+}
+
+define <vscale x 6 x i32>  @insert_nxv6i32_nxv2i32(<vscale x 2 x i32> %sv0, <vscale x 2 x i32> %sv1) nounwind {
+; CHECK-LABEL: insert_nxv6i32_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    uunpklo z2.d, z0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    st1w { z2.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p1, [sp]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32> undef, <vscale x 2 x i32> %sv0, i64 0)
+  %v1 = call <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32> %v0, <vscale x 2 x i32> %sv1, i64 2)
+  ret <vscale x 6 x i32> %v1
+}
+
+;; This only works because the input vector is undef and index is zero
+define  <vscale x 6 x i32> @insert_nxv6i32_nxv3i32(<vscale x 3 x i32> %sv0) {
+; CHECK-LABEL: insert_nxv6i32_nxv3i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32> undef, <vscale x 3 x i32> %sv0, i64 0)
+  ret <vscale x 6 x i32> %v0
+}
+
+define <vscale x 12 x i32> @insert_nxv12i32_nxv4i32(<vscale x 4 x i32> %sv0, <vscale x 4 x i32> %sv1, <vscale x 4 x i32> %sv2) {
+; CHECK-LABEL: insert_nxv12i32_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 12 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> undef, <vscale x 4 x i32> %sv0, i64 0)
+  %v1 = call <vscale x 12 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> %v0, <vscale x 4 x i32> %sv1, i64 4)
+  %v2 = call <vscale x 12 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> %v1, <vscale x 4 x i32> %sv2, i64 8)
+  ret <vscale x 12 x i32> %v2
+}
+
+declare <vscale x 3 x i32> @llvm.experimental.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32>, <vscale x 2 x i32>, i64)
+declare <vscale x 3 x float> @llvm.experimental.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float>, <vscale x 2 x float>, i64)
+declare <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32>, <vscale x 2 x i32>, i64)
+declare <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32>, <vscale x 3 x i32>, i64)
+declare <vscale x 12 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32>, <vscale x 4 x i32>, i64)