diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9c1dc83a3e1a1..e6af51629c5a0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8517,8 +8517,18 @@ def : Pat<(any_fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)),
                     (vector_extract (v8f16 FPR128:$Rn), (i64 1))),
           (f16 (FADDPv2i16p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
 
-// Prefer using the bottom lanes of faddp Rn, Rn compared to
-// faddp extractlow(Rn), extracthigh(Rn)
+// Prefer using the bottom lanes of addp Rn, Rn compared to
+// addp extractlow(Rn), extracthigh(Rn)
+def : Pat<(AArch64addp (v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 0))),
+                       (v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 2)))),
+          (v2i32 (EXTRACT_SUBREG (ADDPv4i32 $Rn, $Rn), dsub))>;
+def : Pat<(AArch64addp (v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 0))),
+                       (v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 4)))),
+          (v4i16 (EXTRACT_SUBREG (ADDPv8i16 $Rn, $Rn), dsub))>;
+def : Pat<(AArch64addp (v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 0))),
+                       (v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 8)))),
+          (v8i8 (EXTRACT_SUBREG (ADDPv16i8 $Rn, $Rn), dsub))>;
+
 def : Pat<(AArch64faddp (v2f32 (extract_subvector (v4f32 FPR128:$Rn), (i64 0))),
                         (v2f32 (extract_subvector (v4f32 FPR128:$Rn), (i64 2)))),
           (v2f32 (EXTRACT_SUBREG (FADDPv4f32 $Rn, $Rn), dsub))>;
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll b/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
index eefa5a9b43d0b..4fd40bb7e229d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-; RUN: llc -global-isel=1 -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>)
 
@@ -123,3 +123,78 @@ define i32 @test_vaddv.v2i32(<2 x i32> %a) {
 }
 
 declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>)
+
+define i32 @addp_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SD-LABEL: addp_v4i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    addp v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT:    dup v1.2s, v0.s[1]
+; CHECK-SD-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: addp_v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ext v1.16b, v0.16b, v0.16b, #0
+; CHECK-GI-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-GI-NEXT:    addp v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    rev64 v1.2s, v0.2s
+; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+  %1 = add <4 x i32> %a, %b
+  %2 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+  %3 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %4 = tail call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %2, <2 x i32> %3)
+  %5 = shufflevector <2 x i32> %4, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+  %6 = add <2 x i32> %4, %5
+  %7 = extractelement <2 x i32> %6, i64 0
+  ret i32 %7
+}
+
+define <4 x i16> @addp_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SD-LABEL: addp_v8i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    addp v0.8h, v0.8h, v0.8h
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: addp_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ext v1.16b, v0.16b, v0.16b, #0
+; CHECK-GI-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-GI-NEXT:    addp v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    ret
+  %1 = add <8 x i16> %a, %b
+  %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = shufflevector <8 x i16> %1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %4 = tail call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %2, <4 x i16> %3)
+  ret <4 x i16> %4
+}
+
+define <8 x i8> @addp_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: addp_v16i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    addp v0.16b, v0.16b, v0.16b
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: addp_v16i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ext v1.16b, v0.16b, v0.16b, #0
+; CHECK-GI-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-GI-NEXT:    addp v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
+  %1 = add <16 x i8> %a, %b
+  %2 = shufflevector <16 x i8> %1, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %3 = shufflevector <16 x i8> %1, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = tail call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %2, <8 x i8> %3)
+  ret <8 x i8> %4
+}
+
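For reference, an illustration rather than part of the patch: the extract-low/extract-high plus addp shape matched by the new patterns is what Clang emits for the ACLE pairwise-add intrinsics when the two halves of one 128-bit register are combined. A minimal C sketch (the function name is hypothetical; only standard <arm_neon.h> intrinsics are assumed):

  #include <arm_neon.h>

  // Pairwise-add adjacent elements across one 128-bit vector.
  // vget_low_s16/vget_high_s16 become the two extract_subvector nodes
  // (indices 0 and 4) and vpadd_s16 becomes an AArch64addp node, so the
  // new v4i16 pattern can select a single "addp v0.8h, v0.8h, v0.8h" and
  // return its bottom 64 bits, instead of two ext/mov instructions
  // feeding a 64-bit "addp v0.4h".
  int16x4_t pairwise_sum_halves(int16x8_t v) {
      return vpadd_s16(vget_low_s16(v), vget_high_s16(v));
  }

As the CHECK-GI lines in the updated test show, only SelectionDAG output changes; GlobalISel still produces the ext/ext plus 64-bit addp sequence.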