[AArch64] Teach perfect shuffle tables about D-lane movs
Similar to D123386, this adds D-movs to the AArch64 perfect shuffle
tables, lowering the costs a little further. This is broadly an
improvement, especially if you ignore mov v0.16b, v2.16b style moves
that are often artefacts of the calling convention.

The D register movs are encoded as (0x4 | LaneIdx). To generate a D
register move we need to bitcast to a wider element type, but it is
otherwise very similar to the S-lane movs already supported.

Differential Revision: https://reviews.llvm.org/D125477
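The lane-field encoding described above is compact enough to sketch in a few lines. The helper below is purely illustrative (the names decodeMovLane and MovLane and the struct layout are assumptions, not code from this patch); it shows how an OP_MOVLANE lane field with bit 0x4 set selects a 64-bit D-lane move instead of a 32-bit S-lane move.

// Illustrative decode of the OP_MOVLANE lane field (names are assumptions,
// not part of the patch). Bit 0x4 distinguishes a 64-bit D-lane move from
// a 32-bit S-lane move.
struct MovLane {
  bool IsDMov;      // move a 64-bit D lane rather than a 32-bit S lane
  unsigned DstLane; // destination lane within the result vector
};

static MovLane decodeMovLane(unsigned RHSID) {
  MovLane M;
  M.IsDMov = (RHSID & 0x4) != 0; // encoded as (0x4 | LaneIdx)
  // A 128-bit vector has two D lanes but four S lanes.
  M.DstLane = M.IsDMov ? (RHSID & 0x1) : (RHSID & 0x3);
  return M;
}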
davemgreen committed May 17, 2022
1 parent bd93df9 commit 4c6a070
Showing 10 changed files with 498 additions and 476 deletions.
72 changes: 28 additions & 44 deletions clang/test/CodeGen/aarch64-neon-vcmla.c
@@ -148,10 +148,8 @@ float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t
}

// CHECK-LABEL: @test_vcmla_lane_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
return vcmla_lane_f16(acc, lhs, rhs, 1);
@@ -209,29 +207,25 @@ float32x2_t test_vcmla_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t r
// CHECK-LABEL: @test_vcmlaq_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
return vcmlaq_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
return vcmlaq_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmla_rot90_lane_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot90_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
return vcmla_rot90_lane_f16(acc, lhs, rhs, 1);
@@ -289,29 +283,25 @@ float32x2_t test_vcmla_rot90_laneq_f32(float32x2_t acc, float32x2_t lhs, float32
// CHECK-LABEL: @test_vcmlaq_rot90_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot90_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
return vcmlaq_rot90_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_rot90_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot90_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
return vcmlaq_rot90_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmla_rot180_lane_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot180_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
return vcmla_rot180_lane_f16(acc, lhs, rhs, 1);
@@ -369,29 +359,25 @@ float32x2_t test_vcmla_rot180_laneq_f32(float32x2_t acc, float32x2_t lhs, float3
// CHECK-LABEL: @test_vcmlaq_rot180_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot180_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
return vcmlaq_rot180_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_rot180_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot180_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
return vcmlaq_rot180_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmla_rot270_lane_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot270_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
return vcmla_rot270_lane_f16(acc, lhs, rhs, 1);
@@ -449,19 +435,17 @@ float32x2_t test_vcmla_rot270_laneq_f32(float32x2_t acc, float32x2_t lhs, float3
// CHECK-LABEL: @test_vcmlaq_rot270_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot270_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
return vcmlaq_rot270_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_rot270_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot270_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
return vcmlaq_rot270_laneq_f32(acc, lhs, rhs, 1);
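The test updates above all follow one pattern: a shuffle over 64-bit lanes is rewritten as the equivalent shuffle over 32-bit lanes, with each wide index expanding into a pair of narrow ones. A minimal sketch of that index widening (the helper name widenD2SMask is an assumption for illustration, not code from this patch):

#include <vector>

// Expand a mask over 64-bit (D) lanes into the equivalent mask over
// 32-bit (S) lanes: each wide element becomes a pair of narrow ones.
static std::vector<int> widenD2SMask(const std::vector<int> &DMask) {
  std::vector<int> SMask;
  for (int Elt : DMask) {
    SMask.push_back(2 * Elt);     // low 32-bit half of the D lane
    SMask.push_back(2 * Elt + 1); // high 32-bit half of the D lane
  }
  return SMask;
}

// widenD2SMask({1, 1}) == {2, 3, 2, 3}: the old <2 x i64> shuffle
// <i32 1, i32 1> matches the new <4 x float> shuffle <2, 3, 2, 3>.
// widenD2SMask({0, 0}) == {0, 1, 0, 1}, matching the zeroinitializer cases.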
39 changes: 31 additions & 8 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9812,14 +9812,37 @@ static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
        LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
    EVT VT = OpLHS.getValueType();
    assert(RHSID < 8 && "Expected a lane index for RHSID!");
-    int MaskElt = getPFIDLane(ID, RHSID);
-    assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
-    unsigned ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
-    SDValue Input = MaskElt < 4 ? V1 : V2;
-    // Be careful about creating illegal types. Use f16 instead of i16.
-    if (VT == MVT::v4i16) {
-      Input = DAG.getBitcast(MVT::v4f16, Input);
-      OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
-    }
+    unsigned ExtLane = 0;
+    SDValue Input;
+
+    // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
+    // convert into a higher type.
+    if (RHSID & 0x4) {
+      int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
+      if (MaskElt == -1)
+        MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
+      assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+      ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
+      Input = MaskElt < 2 ? V1 : V2;
+      if (VT.getScalarSizeInBits() == 16) {
+        Input = DAG.getBitcast(MVT::v2f32, Input);
+        OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
+      } else {
+        assert(VT.getScalarSizeInBits() == 32 &&
+               "Expected 16 or 32 bit shuffle elemements");
+        Input = DAG.getBitcast(MVT::v2f64, Input);
+        OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
+      }
+    } else {
+      int MaskElt = getPFIDLane(ID, RHSID);
+      assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+      ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
+      Input = MaskElt < 4 ? V1 : V2;
+      // Be careful about creating illegal types. Use f16 instead of i16.
+      if (VT == MVT::v4i16) {
+        Input = DAG.getBitcast(MVT::v4f16, Input);
+        OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
+      }
+    }
    SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                              Input.getValueType().getVectorElementType(),
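To make the D-mov path above concrete, here is a sketch of the source-lane computation it performs, under the assumption (implicit in the patch) that getPFIDLane returns the 32-bit mask element for a lane, or -1 for undef. The helper name sourceDLane is illustrative, not from the patch:

// For a D mov into destination D lane Dst (0 or 1), derive the source D
// lane from the two 32-bit mask elements that make up that D lane.
static int sourceDLane(const int Mask[4], unsigned Dst) {
  int Elt = Mask[Dst * 2];       // mask element for the low S lane
  if (Elt == -1)                 // undef: fall back to the high S lane
    Elt = Mask[Dst * 2 + 1] - 1;
  return Elt >> 1;               // 0-1 is a V1 D lane, 2-3 is a V2 D lane
}

// Example: Mask = {-1, 5, 0, 1}, Dst = 0. The low S lane is undef, so use
// the high one: Elt = 5 - 1 = 4, and 4 >> 1 = 2, i.e. V2's D lane 0,
// exactly what ExtLane and Input select in the patch.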
