Skip to content
7 changes: 7 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrGISel.td
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,12 @@ def G_SMULL : AArch64GenericInstruction {
let hasSideEffects = 0;
}

// Generic opcode that legalizeIntrinsic emits for the aarch64_neon_pmull and
// aarch64_neon_pmull64 intrinsics, and that is paired with the AArch64pmull
// node through GINodeEquiv.
// The result ($dst) uses type index 0 while both sources share type index 1,
// so the destination type is allowed to differ from the (common) source type.
def G_PMULL : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type1:$src1, type1:$src2);
let hasSideEffects = 0;
}

def G_UADDLP : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src1);
Expand Down Expand Up @@ -273,6 +279,7 @@ def : GINodeEquiv<G_FCMGT, AArch64fcmgt>;

def : GINodeEquiv<G_BSP, AArch64bsp>;

// Pair the G_PMULL, G_UMULL and G_SMULL generic opcodes with the AArch64pmull,
// AArch64umull and AArch64smull nodes respectively.
// NOTE(review): presumably this is what lets patterns written against those
// nodes be reused for the generic opcodes -- confirm against GINodeEquiv.
def : GINodeEquiv<G_PMULL, AArch64pmull>;
def : GINodeEquiv<G_UMULL, AArch64umull>;
def : GINodeEquiv<G_SMULL, AArch64smull>;

Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1809,6 +1809,9 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return LowerBinOp(TargetOpcode::G_FMAXNUM);
case Intrinsic::aarch64_neon_fminnm:
return LowerBinOp(TargetOpcode::G_FMINNUM);
case Intrinsic::aarch64_neon_pmull:
case Intrinsic::aarch64_neon_pmull64:
return LowerBinOp(AArch64::G_PMULL);
case Intrinsic::aarch64_neon_smull:
return LowerBinOp(AArch64::G_SMULL);
case Intrinsic::aarch64_neon_umull:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,7 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI,
case TargetOpcode::G_FCMP:
case TargetOpcode::G_LROUND:
case TargetOpcode::G_LLROUND:
case AArch64::G_PMULL:
return true;
case TargetOpcode::G_INTRINSIC:
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
Expand Down
84 changes: 62 additions & 22 deletions llvm/test/CodeGen/AArch64/aarch64-smull.ll
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI

; CHECK-GI: warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmlsl_pmlsl2_v8i16_uzp1
; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI

define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
Expand Down Expand Up @@ -1832,14 +1829,33 @@ entry:
}

define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
; CHECK-LABEL: pmlsl2_v8i16_uzp1:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q2, [x1, #16]
; CHECK-NEXT: uzp1 v2.16b, v0.16b, v2.16b
; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b
; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
; CHECK-NEON-LABEL: pmlsl2_v8i16_uzp1:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: ldr q2, [x1, #16]
; CHECK-NEON-NEXT: uzp1 v2.16b, v0.16b, v2.16b
; CHECK-NEON-NEXT: pmull2 v0.8h, v0.16b, v2.16b
; CHECK-NEON-NEXT: sub v0.8h, v1.8h, v0.8h
; CHECK-NEON-NEXT: str q0, [x0]
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: pmlsl2_v8i16_uzp1:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: ldr q2, [x1, #16]
; CHECK-SVE-NEXT: uzp1 v2.16b, v0.16b, v2.16b
; CHECK-SVE-NEXT: pmull2 v0.8h, v0.16b, v2.16b
; CHECK-SVE-NEXT: sub v0.8h, v1.8h, v0.8h
; CHECK-SVE-NEXT: str q0, [x0]
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: pmlsl2_v8i16_uzp1:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr q2, [x1, #16]
; CHECK-GI-NEXT: mov d0, v0.d[1]
; CHECK-GI-NEXT: xtn v2.8b, v2.8h
; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v2.8b
; CHECK-GI-NEXT: sub v0.8h, v1.8h, v0.8h
; CHECK-GI-NEXT: str q0, [x0]
; CHECK-GI-NEXT: ret
%5 = getelementptr inbounds i32, ptr %3, i64 4
%6 = load <8 x i16>, ptr %5, align 4
%7 = trunc <8 x i16> %6 to <8 x i8>
Expand Down Expand Up @@ -1991,16 +2007,40 @@ define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
}

define void @pmlsl_pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
; CHECK-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: uzp1 v2.16b, v2.16b, v3.16b
; CHECK-NEXT: pmull v3.8h, v0.8b, v2.8b
; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b
; CHECK-NEXT: add v0.8h, v3.8h, v0.8h
; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
; CHECK-NEON-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
; CHECK-NEON: // %bb.0: // %entry
; CHECK-NEON-NEXT: ldp q2, q3, [x1]
; CHECK-NEON-NEXT: uzp1 v2.16b, v2.16b, v3.16b
; CHECK-NEON-NEXT: pmull v3.8h, v0.8b, v2.8b
; CHECK-NEON-NEXT: pmull2 v0.8h, v0.16b, v2.16b
; CHECK-NEON-NEXT: add v0.8h, v3.8h, v0.8h
; CHECK-NEON-NEXT: sub v0.8h, v1.8h, v0.8h
; CHECK-NEON-NEXT: str q0, [x0]
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
; CHECK-SVE: // %bb.0: // %entry
; CHECK-SVE-NEXT: ldp q2, q3, [x1]
; CHECK-SVE-NEXT: uzp1 v2.16b, v2.16b, v3.16b
; CHECK-SVE-NEXT: pmull v3.8h, v0.8b, v2.8b
; CHECK-SVE-NEXT: pmull2 v0.8h, v0.16b, v2.16b
; CHECK-SVE-NEXT: add v0.8h, v3.8h, v0.8h
; CHECK-SVE-NEXT: sub v0.8h, v1.8h, v0.8h
; CHECK-SVE-NEXT: str q0, [x0]
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldp q2, q3, [x1]
; CHECK-GI-NEXT: mov d4, v0.d[1]
; CHECK-GI-NEXT: xtn v2.8b, v2.8h
; CHECK-GI-NEXT: xtn v3.8b, v3.8h
; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v2.8b
; CHECK-GI-NEXT: pmull v2.8h, v4.8b, v3.8b
; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h
; CHECK-GI-NEXT: sub v0.8h, v1.8h, v0.8h
; CHECK-GI-NEXT: str q0, [x0]
; CHECK-GI-NEXT: ret
entry:
%5 = load <8 x i16>, ptr %3, align 4
%6 = trunc <8 x i16> %5 to <8 x i8>
Expand Down
55 changes: 35 additions & 20 deletions llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI

; CHECK-GI: warning: Instruction selection used fallback path for test_vmull_p8
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_high_p8
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_p64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_high_p64
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI

declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5
Expand Down Expand Up @@ -2721,27 +2716,47 @@ entry:
}

define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
; CHECK-LABEL: test_vmull_p64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, x1
; CHECK-NEXT: fmov d1, x0
; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
; CHECK-NEXT: mov x1, v0.d[1]
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
; CHECK-SD-LABEL: test_vmull_p64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fmov d0, x1
; CHECK-SD-NEXT: fmov d1, x0
; CHECK-SD-NEXT: pmull v0.1q, v1.1d, v0.1d
; CHECK-SD-NEXT: mov x1, v0.d[1]
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_vmull_p64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov d0, x0
; CHECK-GI-NEXT: fmov d1, x1
; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: fmov x0, d0
; CHECK-GI-NEXT: fmov x1, d1
; CHECK-GI-NEXT: ret
entry:
%vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b)
%vmull3.i = bitcast <16 x i8> %vmull2.i to i128
ret i128 %vmull3.i
}

define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 {
; CHECK-LABEL: test_vmull_high_p64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d
; CHECK-NEXT: mov x1, v0.d[1]
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
; CHECK-SD-LABEL: test_vmull_high_p64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: pmull2 v0.1q, v0.2d, v1.2d
; CHECK-SD-NEXT: mov x1, v0.d[1]
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_vmull_high_p64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v0.d[1]
; CHECK-GI-NEXT: mov d1, v1.d[1]
; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: fmov x0, d0
; CHECK-GI-NEXT: fmov x1, d1
; CHECK-GI-NEXT: ret
entry:
%0 = extractelement <2 x i64> %a, i32 1
%1 = extractelement <2 x i64> %b, i32 1
Expand Down
Loading