[AArch64] Teach perfect shuffle tables about D-lane movs
Similar to D123386, this adds D-movs to the AArch64 perfect shuffle
tables, lowering the costs a little further. This is broadly an
improvement, especially if you ignore mov v0.16b, v2.16b style moves
that are often artefacts of the calling convention.

The D register movs are encoded as (0x4 | LaneIdx). To generate a D
register move we need to bitcast to a wider element type, but it is
otherwise very similar to the S-lane movs already supported.

Differential Revision: https://reviews.llvm.org/D125477
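The lane-field encoding described above is compact enough to sketch in a few lines. The helper below is purely illustrative (the names decodeMovLane and MovLane and the struct layout are assumptions, not code from this patch); it shows how an OP_MOVLANE lane field with bit 0x4 set selects a 64-bit D-lane move instead of a 32-bit S-lane move.

// Illustrative decode of the OP_MOVLANE lane field (names are assumptions,
// not part of the patch). Bit 0x4 distinguishes a 64-bit D-lane move from
// a 32-bit S-lane move.
struct MovLane {
  bool IsDMov;      // move a 64-bit D lane rather than a 32-bit S lane
  unsigned DstLane; // destination lane within the result vector
};

static MovLane decodeMovLane(unsigned RHSID) {
  MovLane M;
  M.IsDMov = (RHSID & 0x4) != 0; // encoded as (0x4 | LaneIdx)
  // A 128-bit vector has two D lanes but four S lanes.
  M.DstLane = M.IsDMov ? (RHSID & 0x1) : (RHSID & 0x3);
  return M;
}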
davemgreen committed May 17, 2022
1 parent bd93df9 commit 4c6a070
Showing 10 changed files with 498 additions and 476 deletions.
72 changes: 28 additions & 44 deletions clang/test/CodeGen/aarch64-neon-vcmla.c
@@ -148,10 +148,8 @@ float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t
}

// CHECK-LABEL: @test_vcmla_lane_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
return vcmla_lane_f16(acc, lhs, rhs, 1);
@@ -209,29 +207,25 @@ float32x2_t test_vcmla_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t r
// CHECK-LABEL: @test_vcmlaq_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
return vcmlaq_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
return vcmlaq_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmla_rot90_lane_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot90_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
return vcmla_rot90_lane_f16(acc, lhs, rhs, 1);
@@ -289,29 +283,25 @@ float32x2_t test_vcmla_rot90_laneq_f32(float32x2_t acc, float32x2_t lhs, float32
// CHECK-LABEL: @test_vcmlaq_rot90_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot90_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
return vcmlaq_rot90_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_rot90_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot90_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
return vcmlaq_rot90_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmla_rot180_lane_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot180_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
return vcmla_rot180_lane_f16(acc, lhs, rhs, 1);
@@ -369,29 +359,25 @@ float32x2_t test_vcmla_rot180_laneq_f32(float32x2_t acc, float32x2_t lhs, float3
// CHECK-LABEL: @test_vcmlaq_rot180_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot180_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
return vcmlaq_rot180_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_rot180_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot180_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
return vcmlaq_rot180_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmla_rot270_lane_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot270_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
return vcmla_rot270_lane_f16(acc, lhs, rhs, 1);
@@ -449,19 +435,17 @@ float32x2_t test_vcmla_rot270_laneq_f32(float32x2_t acc, float32x2_t lhs, float3
// CHECK-LABEL: @test_vcmlaq_rot270_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot270_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
return vcmlaq_rot270_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_rot270_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot270_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
return vcmlaq_rot270_laneq_f32(acc, lhs, rhs, 1);
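The test updates above all follow one pattern: a shuffle over 64-bit lanes is rewritten as the equivalent shuffle over 32-bit lanes, with each wide index expanding into a pair of narrow ones. A minimal sketch of that index widening (the helper name widenD2SMask is an assumption for illustration, not code from this patch):

#include <vector>

// Expand a mask over 64-bit (D) lanes into the equivalent mask over
// 32-bit (S) lanes: each wide element becomes a pair of narrow ones.
static std::vector<int> widenD2SMask(const std::vector<int> &DMask) {
  std::vector<int> SMask;
  for (int Elt : DMask) {
    SMask.push_back(2 * Elt);     // low 32-bit half of the D lane
    SMask.push_back(2 * Elt + 1); // high 32-bit half of the D lane
  }
  return SMask;
}

// widenD2SMask({1, 1}) == {2, 3, 2, 3}: the old <2 x i64> shuffle
// <i32 1, i32 1> matches the new <4 x float> shuffle <2, 3, 2, 3>.
// widenD2SMask({0, 0}) == {0, 1, 0, 1}, matching the zeroinitializer cases.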
39 changes: 31 additions & 8 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9812,14 +9812,37 @@ static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
        LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
    EVT VT = OpLHS.getValueType();
    assert(RHSID < 8 && "Expected a lane index for RHSID!");
-    int MaskElt = getPFIDLane(ID, RHSID);
-    assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
-    unsigned ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
-    SDValue Input = MaskElt < 4 ? V1 : V2;
-    // Be careful about creating illegal types. Use f16 instead of i16.
-    if (VT == MVT::v4i16) {
-      Input = DAG.getBitcast(MVT::v4f16, Input);
-      OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
-    }
+    unsigned ExtLane = 0;
+    SDValue Input;
+
+    // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
+    // convert into a higher type.
+    if (RHSID & 0x4) {
+      int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
+      if (MaskElt == -1)
+        MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
+      assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+      ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
+      Input = MaskElt < 2 ? V1 : V2;
+      if (VT.getScalarSizeInBits() == 16) {
+        Input = DAG.getBitcast(MVT::v2f32, Input);
+        OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
+      } else {
+        assert(VT.getScalarSizeInBits() == 32 &&
+               "Expected 16 or 32 bit shuffle elemements");
+        Input = DAG.getBitcast(MVT::v2f64, Input);
+        OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
+      }
+    } else {
+      int MaskElt = getPFIDLane(ID, RHSID);
+      assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+      ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
+      Input = MaskElt < 4 ? V1 : V2;
+      // Be careful about creating illegal types. Use f16 instead of i16.
+      if (VT == MVT::v4i16) {
+        Input = DAG.getBitcast(MVT::v4f16, Input);
+        OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
+      }
+    }
    SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                              Input.getValueType().getVectorElementType(),
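To make the D-mov path above concrete, here is a sketch of the source-lane computation it performs, under the assumption (implicit in the patch) that getPFIDLane returns the 32-bit mask element for a lane, or -1 for undef. The helper name sourceDLane is illustrative, not from the patch:

// For a D mov into destination D lane Dst (0 or 1), derive the source D
// lane from the two 32-bit mask elements that make up that D lane.
static int sourceDLane(const int Mask[4], unsigned Dst) {
  int Elt = Mask[Dst * 2];       // mask element for the low S lane
  if (Elt == -1)                 // undef: fall back to the high S lane
    Elt = Mask[Dst * 2 + 1] - 1;
  return Elt >> 1;               // 0-1 is a V1 D lane, 2-3 is a V2 D lane
}

// Example: Mask = {-1, 5, 0, 1}, Dst = 0. The low S lane is undef, so use
// the high one: Elt = 5 - 1 = 4, and 4 >> 1 = 2, i.e. V2's D lane 0,
// exactly what ExtLane and Input select in the patch.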
