-
Notifications
You must be signed in to change notification settings - Fork 11k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64][GlobalISel] Push ADD/SUB through Extend Instructions #90964
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64 Author: None (chuongg3) Changes: The regression in one test is due to a SUB instruction being pushed through the extend, leaving behind the abs instruction, which prevents it from selecting uabdl instructions shown below:
This is intended to be fixed in a follow up patch Patch is 30.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/90964.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 10cad6d1924407..01237640133268 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -52,6 +52,16 @@ def ext_uaddv_to_uaddlv : GICombineRule<
(apply [{ applyExtUaddvToUaddlv(*${root}, MRI, B, Observer, ${matchinfo}); }])
>;
+// Push G_ADD and G_SUB through G_{Z/S}EXT to allow better selection of addl/subl instructions
+// add(ext, ext) => ext(add(ext, ext))
+def push_add_matchinfo :
+ GIDefMatchData<"std::tuple<bool, Register, Register, Register>">;
+def push_add_sub_through_ext : GICombineRule<
+ (defs root:$root, push_add_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_ADD, G_SUB):$root,
+ [{ return matchPushAddSubExt(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyPushAddSubExt(*${root}, MRI, B, ${matchinfo}); }])>;
+
def AArch64PreLegalizerCombiner: GICombiner<
"AArch64PreLegalizerCombinerImpl", [all_combines,
fconstant_to_constant,
@@ -59,7 +69,8 @@ def AArch64PreLegalizerCombiner: GICombiner<
fold_global_offset,
shuffle_to_extract,
ext_addv_to_udot_addv,
- ext_uaddv_to_uaddlv]> {
+ ext_uaddv_to_uaddlv,
+ push_add_sub_through_ext]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index a82d3cd095659b..d4d53a87154d73 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -554,6 +554,74 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
MI.eraseFromParent();
}
+// Pushes ADD/SUB through extend instructions to decrease the number of extend
+// instruction at the end by allowing selection of {s|u}addl sooner
+
+// i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8))
+bool matchPushAddSubExt(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ std::tuple<bool, Register, Register, Register> &matchinfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_ADD ||
+ MI.getOpcode() == TargetOpcode::G_SUB &&
+ "Expected a G_ADD or G_SUB instruction\n");
+ MachineInstr *ExtMI1 = MRI.getVRegDef(MI.getOperand(1).getReg());
+ MachineInstr *ExtMI2 = MRI.getVRegDef(MI.getOperand(2).getReg());
+
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ if (!DstTy.isVector())
+ return false;
+
+ // Check the source came from G_{S/Z}EXT instructions
+ if (ExtMI1->getOpcode() != ExtMI2->getOpcode() ||
+ (ExtMI1->getOpcode() != TargetOpcode::G_SEXT &&
+ ExtMI1->getOpcode() != TargetOpcode::G_ZEXT))
+ return false;
+
+ if (!MRI.hasOneUse(ExtMI1->getOperand(0).getReg()) ||
+ !MRI.hasOneUse(ExtMI2->getOperand(0).getReg()))
+ return false;
+
+ // Return true if G_{S|Z}EXT instruction is more than 2* source
+ Register ExtDstReg = MI.getOperand(1).getReg();
+ get<0>(matchinfo) = ExtMI1->getOpcode() == TargetOpcode::G_SEXT;
+ get<1>(matchinfo) = MI.getOperand(0).getReg();
+ get<2>(matchinfo) = ExtMI1->getOperand(1).getReg();
+ get<3>(matchinfo) = ExtMI2->getOperand(1).getReg();
+
+ LLT ExtDstTy = MRI.getType(ExtDstReg);
+ LLT Ext1SrcTy = MRI.getType(get<2>(matchinfo));
+ LLT Ext2SrcTy = MRI.getType(get<3>(matchinfo));
+
+ if (((Ext1SrcTy.getScalarSizeInBits() == 8 &&
+ ExtDstTy.getScalarSizeInBits() == 32) ||
+ ((Ext1SrcTy.getScalarSizeInBits() == 8 ||
+ Ext1SrcTy.getScalarSizeInBits() == 16) &&
+ ExtDstTy.getScalarSizeInBits() == 64)) &&
+ Ext1SrcTy == Ext2SrcTy)
+ return true;
+
+ return false;
+}
+
+void applyPushAddSubExt(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+ std::tuple<bool, Register, Register, Register> &matchinfo) {
+ LLT SrcTy = MRI.getType(get<2>(matchinfo));
+ LLT MidTy = SrcTy.changeElementSize(SrcTy.getScalarSizeInBits() * 2);
+ unsigned Opc =
+ get<0>(matchinfo) ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
+ Register Ext1Reg = B.buildInstr(Opc, {MidTy}, {get<2>(matchinfo)}).getReg(0);
+ Register Ext2Reg = B.buildInstr(Opc, {MidTy}, {get<3>(matchinfo)}).getReg(0);
+ Register AddReg =
+ B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0);
+ if (MI.getOpcode() == TargetOpcode::G_ADD)
+ B.buildInstr(Opc, {get<1>(matchinfo)}, {AddReg});
+ else
+ B.buildSExt(get<1>(matchinfo), AddReg);
+
+ MI.eraseFromParent();
+}
+
bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
CombinerHelper &Helper, GISelChangeObserver &Observer) {
// Try simplify G_UADDO with 8 or 16 bit operands to wide G_ADD and TBNZ if
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index ee035ec1941d57..ff8a65c0c7924f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -97,17 +97,17 @@ define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias
; GISEL-NEXT: ldr d1, [x0]
; GISEL-NEXT: ldr d2, [x1]
; GISEL-NEXT: movi v0.2d, #0000000000000000
-; GISEL-NEXT: ushll v1.8h, v1.8b, #0
-; GISEL-NEXT: ushll v2.8h, v2.8b, #0
-; GISEL-NEXT: usubl v3.4s, v1.4h, v2.4h
-; GISEL-NEXT: usubl2 v1.4s, v1.8h, v2.8h
-; GISEL-NEXT: cmgt v2.4s, v0.4s, v3.4s
-; GISEL-NEXT: cmgt v0.4s, v0.4s, v1.4s
-; GISEL-NEXT: neg v4.4s, v3.4s
-; GISEL-NEXT: neg v5.4s, v1.4s
-; GISEL-NEXT: bsl v2.16b, v4.16b, v3.16b
-; GISEL-NEXT: bsl v0.16b, v5.16b, v1.16b
-; GISEL-NEXT: add v0.4s, v2.4s, v0.4s
+; GISEL-NEXT: usubl v1.8h, v1.8b, v2.8b
+; GISEL-NEXT: sshll v2.4s, v1.4h, #0
+; GISEL-NEXT: sshll2 v3.4s, v1.8h, #0
+; GISEL-NEXT: cmgt v4.4s, v0.4s, v2.4s
+; GISEL-NEXT: cmgt v5.4s, v0.4s, v3.4s
+; GISEL-NEXT: neg v6.4s, v2.4s
+; GISEL-NEXT: ssubw2 v0.4s, v0.4s, v1.8h
+; GISEL-NEXT: mov v1.16b, v4.16b
+; GISEL-NEXT: bif v0.16b, v3.16b, v5.16b
+; GISEL-NEXT: bsl v1.16b, v6.16b, v2.16b
+; GISEL-NEXT: add v0.4s, v1.4s, v0.4s
; GISEL-NEXT: addv s0, v0.4s
; GISEL-NEXT: fmov w0, s0
; GISEL-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
index d64327656a9e01..ec667ca4bb45e0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -290,27 +290,26 @@ define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
;
; CHECK-GI-LABEL: uabd16b_rdx_i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ushll.8h v3, v0, #0
-; CHECK-GI-NEXT: ushll.8h v4, v1, #0
-; CHECK-GI-NEXT: ushll2.8h v0, v0, #0
-; CHECK-GI-NEXT: ushll2.8h v1, v1, #0
+; CHECK-GI-NEXT: usubl.8h v3, v0, v1
+; CHECK-GI-NEXT: usubl2.8h v0, v0, v1
; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
-; CHECK-GI-NEXT: usubl.4s v5, v3, v4
-; CHECK-GI-NEXT: usubl2.4s v3, v3, v4
-; CHECK-GI-NEXT: usubl.4s v4, v0, v1
-; CHECK-GI-NEXT: usubl2.4s v0, v0, v1
-; CHECK-GI-NEXT: cmgt.4s v1, v2, v5
-; CHECK-GI-NEXT: cmgt.4s v6, v2, v3
-; CHECK-GI-NEXT: neg.4s v16, v5
-; CHECK-GI-NEXT: cmgt.4s v7, v2, v4
-; CHECK-GI-NEXT: cmgt.4s v2, v2, v0
-; CHECK-GI-NEXT: neg.4s v17, v3
-; CHECK-GI-NEXT: neg.4s v18, v4
-; CHECK-GI-NEXT: neg.4s v19, v0
-; CHECK-GI-NEXT: bsl.16b v1, v16, v5
-; CHECK-GI-NEXT: bit.16b v3, v17, v6
-; CHECK-GI-NEXT: bit.16b v4, v18, v7
-; CHECK-GI-NEXT: bit.16b v0, v19, v2
+; CHECK-GI-NEXT: sshll2.4s v4, v3, #0
+; CHECK-GI-NEXT: sshll.4s v5, v0, #0
+; CHECK-GI-NEXT: sshll.4s v1, v3, #0
+; CHECK-GI-NEXT: sshll2.4s v6, v0, #0
+; CHECK-GI-NEXT: ssubw2.4s v3, v2, v3
+; CHECK-GI-NEXT: ssubw2.4s v0, v2, v0
+; CHECK-GI-NEXT: cmgt.4s v16, v2, v4
+; CHECK-GI-NEXT: cmgt.4s v18, v2, v5
+; CHECK-GI-NEXT: cmgt.4s v7, v2, v1
+; CHECK-GI-NEXT: neg.4s v17, v1
+; CHECK-GI-NEXT: cmgt.4s v2, v2, v6
+; CHECK-GI-NEXT: neg.4s v19, v5
+; CHECK-GI-NEXT: bif.16b v3, v4, v16
+; CHECK-GI-NEXT: mov.16b v4, v18
+; CHECK-GI-NEXT: bit.16b v1, v17, v7
+; CHECK-GI-NEXT: bif.16b v0, v6, v2
+; CHECK-GI-NEXT: bsl.16b v4, v19, v5
; CHECK-GI-NEXT: add.4s v1, v1, v3
; CHECK-GI-NEXT: add.4s v0, v4, v0
; CHECK-GI-NEXT: add.4s v0, v1, v0
@@ -338,27 +337,26 @@ define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
;
; CHECK-GI-LABEL: sabd16b_rdx_i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: sshll.8h v3, v0, #0
-; CHECK-GI-NEXT: sshll.8h v4, v1, #0
-; CHECK-GI-NEXT: sshll2.8h v0, v0, #0
-; CHECK-GI-NEXT: sshll2.8h v1, v1, #0
+; CHECK-GI-NEXT: ssubl.8h v3, v0, v1
+; CHECK-GI-NEXT: ssubl2.8h v0, v0, v1
; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
-; CHECK-GI-NEXT: ssubl.4s v5, v3, v4
-; CHECK-GI-NEXT: ssubl2.4s v3, v3, v4
-; CHECK-GI-NEXT: ssubl.4s v4, v0, v1
-; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1
-; CHECK-GI-NEXT: cmgt.4s v1, v2, v5
-; CHECK-GI-NEXT: cmgt.4s v6, v2, v3
-; CHECK-GI-NEXT: neg.4s v16, v5
-; CHECK-GI-NEXT: cmgt.4s v7, v2, v4
-; CHECK-GI-NEXT: cmgt.4s v2, v2, v0
-; CHECK-GI-NEXT: neg.4s v17, v3
-; CHECK-GI-NEXT: neg.4s v18, v4
-; CHECK-GI-NEXT: neg.4s v19, v0
-; CHECK-GI-NEXT: bsl.16b v1, v16, v5
-; CHECK-GI-NEXT: bit.16b v3, v17, v6
-; CHECK-GI-NEXT: bit.16b v4, v18, v7
-; CHECK-GI-NEXT: bit.16b v0, v19, v2
+; CHECK-GI-NEXT: sshll2.4s v4, v3, #0
+; CHECK-GI-NEXT: sshll.4s v5, v0, #0
+; CHECK-GI-NEXT: sshll.4s v1, v3, #0
+; CHECK-GI-NEXT: sshll2.4s v6, v0, #0
+; CHECK-GI-NEXT: ssubw2.4s v3, v2, v3
+; CHECK-GI-NEXT: ssubw2.4s v0, v2, v0
+; CHECK-GI-NEXT: cmgt.4s v16, v2, v4
+; CHECK-GI-NEXT: cmgt.4s v18, v2, v5
+; CHECK-GI-NEXT: cmgt.4s v7, v2, v1
+; CHECK-GI-NEXT: neg.4s v17, v1
+; CHECK-GI-NEXT: cmgt.4s v2, v2, v6
+; CHECK-GI-NEXT: neg.4s v19, v5
+; CHECK-GI-NEXT: bif.16b v3, v4, v16
+; CHECK-GI-NEXT: mov.16b v4, v18
+; CHECK-GI-NEXT: bit.16b v1, v17, v7
+; CHECK-GI-NEXT: bif.16b v0, v6, v2
+; CHECK-GI-NEXT: bsl.16b v4, v19, v5
; CHECK-GI-NEXT: add.4s v1, v1, v3
; CHECK-GI-NEXT: add.4s v0, v4, v0
; CHECK-GI-NEXT: add.4s v0, v1, v0
diff --git a/llvm/test/CodeGen/AArch64/arm64-vadd.ll b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
index 38a568ac919168..044c60500ff653 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
@@ -1193,6 +1193,366 @@ define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
ret <2 x i64> %res
}
+define <8 x i32> @saddl_v8i8_v8i32(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-SD-LABEL: saddl_v8i8_v8i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddl_v8i8_v8i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: saddl v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: ret
+ %c = sext <8 x i8> %a to <8 x i32>
+ %d = sext <8 x i8> %b to <8 x i32>
+ %e = add <8 x i32> %c, %d
+ ret <8 x i32> %e
+}
+
+define <8 x i64> @saddl_v8i8_v8i64(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-SD-LABEL: saddl_v8i8_v8i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: sshll v1.4s, v0.4h, #0
+; CHECK-SD-NEXT: sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT: sshll v0.2d, v1.2s, #0
+; CHECK-SD-NEXT: sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT: sshll2 v1.2d, v1.4s, #0
+; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddl_v8i8_v8i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: saddl v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT: sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT: sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT: sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT: sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT: ret
+ %c = sext <8 x i8> %a to <8 x i64>
+ %d = sext <8 x i8> %b to <8 x i64>
+ %e = add <8 x i64> %c, %d
+ ret <8 x i64> %e
+}
+
+define <8 x i64> @saddl_v8i16_v8i64(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SD-LABEL: saddl_v8i16_v8i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: saddl v2.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT: saddl2 v4.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT: sshll v0.2d, v2.2s, #0
+; CHECK-SD-NEXT: sshll2 v3.2d, v4.4s, #0
+; CHECK-SD-NEXT: sshll2 v1.2d, v2.4s, #0
+; CHECK-SD-NEXT: sshll v2.2d, v4.2s, #0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddl_v8i16_v8i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: saddl v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: saddl2 v3.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: sshll v0.2d, v2.2s, #0
+; CHECK-GI-NEXT: sshll2 v1.2d, v2.4s, #0
+; CHECK-GI-NEXT: sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT: sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT: ret
+ %c = sext <8 x i16> %a to <8 x i64>
+ %d = sext <8 x i16> %b to <8 x i64>
+ %e = add <8 x i64> %c, %d
+ ret <8 x i64> %e
+}
+
+define <16 x i32> @saddl_v16i8_v16i32(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: saddl_v16i8_v16i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: saddl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: saddl2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT: sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT: sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT: sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddl_v16i8_v16i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: saddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: saddl2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT: sshll2 v1.4s, v2.8h, #0
+; CHECK-GI-NEXT: sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT: ret
+ %c = sext <16 x i8> %a to <16 x i32>
+ %d = sext <16 x i8> %b to <16 x i32>
+ %e = add <16 x i32> %c, %d
+ ret <16 x i32> %e
+}
+
+define <16 x i64> @saddl_v16i8_v16i64(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: saddl_v16i8_v16i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: saddl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: saddl2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: sshll v3.4s, v2.4h, #0
+; CHECK-SD-NEXT: sshll2 v2.4s, v2.8h, #0
+; CHECK-SD-NEXT: sshll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT: sshll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT: sshll2 v1.2d, v3.4s, #0
+; CHECK-SD-NEXT: sshll v0.2d, v3.2s, #0
+; CHECK-SD-NEXT: sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT: sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT: sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT: sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT: sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddl_v16i8_v16i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: saddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: saddl2 v0.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT: sshll2 v3.4s, v2.8h, #0
+; CHECK-GI-NEXT: sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshll2 v7.4s, v0.8h, #0
+; CHECK-GI-NEXT: sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT: sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT: sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT: sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT: sshll v4.2d, v5.2s, #0
+; CHECK-GI-NEXT: sshll2 v5.2d, v5.4s, #0
+; CHECK-GI-NEXT: sshll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT: sshll2 v7.2d, v7.4s, #0
+; CHECK-GI-NEXT: ret
+ %c = sext <16 x i8> %a to <16 x i64>
+ %d = sext <16 x i8> %b to <16 x i64>
+ %e = add <16 x i64> %c, %d
+ ret <16 x i64> %e
+}
+
+define <16 x i64> @saddl_v16i16_v16i64(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-SD-LABEL: saddl_v16i16_v16i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: saddl v5.4s, v1.4h, v3.4h
+; CHECK-SD-NEXT: saddl v4.4s, v0.4h, v2.4h
+; CHECK-SD-NEXT: saddl2 v2.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT: saddl2 v6.4s, v1.8h, v3.8h
+; CHECK-SD-NEXT: sshll2 v1.2d, v4.4s, #0
+; CHECK-SD-NEXT: sshll v0.2d, v4.2s, #0
+; CHECK-SD-NEXT: sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT: sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT: sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT: sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT: sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddl_v16i16_v16i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: saddl v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT: saddl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT: saddl v6.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT: saddl2 v7.4s, v1.8h, v3.8h
+; CHECK-GI-NEXT: sshll v0.2d, v4.2s, #0
+; CHECK-GI-NEXT: sshll2 v1.2d, v4.4s, #0
+; CHECK-GI-NEXT: sshll v2.2d, v5.2s, #0
+; CHECK-GI-NEXT: sshll2 v3.2d, v5.4s, #0
+; CHECK-GI-NEXT: sshll v4.2d, v6.2s, #0
+; CHECK-GI-NEXT: sshll2 v5.2d, v6.4s, #0
+; CHECK-GI-NEXT: sshll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT: sshll2 v7.2d, v7.4s, #0
+; CHECK-GI-NEXT: ret
+ %c = sext <16 x i16> %a to <16 x i64>
+ %d = sext <16 x i16> %b to <16 x i64>
+ %e = add <16 x i64> %c, %d
+ ret <16 x i64> %e
+}
+
+define <8 x i32> @uaddl_v8i8_v8i32(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-SD-LABEL: uaddl_v8i8_v8i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddl_v8i8_v8i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uaddl v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: ushll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: ret
+ %c = zext <8 x i8> %a to <8 x i32>
+ %d = zext <8 x i8> %b to <8 x i32>
+ %e = add <8 x i32> %c, %d
+ ret <8 x i32> %e
+}
+
+define <8 x i64> @uaddl_v8i8_v8i64(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-SD-LABEL: uaddl_v8i8_v8i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll v0.2d, v1.2s, #0
+; CHECK-SD-NEXT: ushll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT: ushll2 v1.2d, v1.4s, #0
+; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddl_v8i8_v8i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uaddl v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT: ushll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT: ushll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT: ret
+ %c = zext <8 x i8> %a to <8 x i64>
+ %d = zext <8 x i8> %b to <8 x i64>
+ %e = add <8 x i64> %c, %d
+ ret <8 x i64> %e
+}
+
+define <8 x i64> @uaddl_v8i16_v8i64(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SD-LABEL: uaddl_v8i16_v8i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uaddl v2.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT: uaddl2 v4.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT: ushll v0.2d, v2.2s, #0
+; CHECK-SD-NEXT: ushll2 v3.2d, v4.4s, #0
+; CHECK-SD-NEXT: ushll2 v1.2d, v2.4s, #0
+; CHECK-SD-NEXT: ushll v2.2d, v4.2s, #0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddl_v8i16_v8i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uaddl v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: uaddl2 v3.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: ushll v0.2d, v2.2s, #0
+; CHECK-GI-NEXT: ushll2 v1.2d, v2.4s, #0
+; CHECK-GI-NEXT: ushll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT: ushll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT: ret
+ %c = zext <8 x i16> %a to <8 x i64>
+ %d = zext <8 x i16> %b to <8 x i64>
+ %e = add <8 x i64> %c, %d
+ ret <8 x i64> %e
+}
+
+define <16 x i32> @uaddl_v16i8_v16i32(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: uaddl_v16i8_v16i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uaddl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: uaddl2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: ushll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT: ushll2 v3.4s, v4.8h, #0
+;...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd like a MIR test for this as well.
// Check the source came from G_{S/Z}EXT instructions | ||
if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || | ||
(ExtMI1->getOpcode() != TargetOpcode::G_SEXT && | ||
ExtMI1->getOpcode() != TargetOpcode::G_ZEXT)) | ||
return false; | ||
|
||
if (!MRI.hasOneUse(ExtMI1->getOperand(0).getReg()) || | ||
!MRI.hasOneUse(ExtMI2->getOperand(0).getReg())) | ||
return false; | ||
|
||
// Return true if G_{S|Z}EXT instruction is more than 2* source |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you use MIPattern for these?
a1c53b5
to
ed587b5
Compare
bool ZExt = | ||
mi_match(Src1Reg, MRI, | ||
m_OneNonDBGUse(m_GZExt(m_Reg(get<2>(matchinfo))))) && | ||
mi_match(Src2Reg, MRI, m_OneNonDBGUse(m_GZExt(m_Reg(get<3>(matchinfo))))); | ||
bool SExt = | ||
mi_match(Src1Reg, MRI, | ||
m_OneNonDBGUse(m_GSExt(m_Reg(get<2>(matchinfo))))) && | ||
mi_match(Src2Reg, MRI, m_OneNonDBGUse(m_GSExt(m_Reg(get<3>(matchinfo))))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you push this into the tablegen pattern?
GIDefMatchData<"std::tuple<bool, Register, Register, Register>">; | ||
def push_add_sub_through_ext : GICombineRule< | ||
(defs root:$root, push_add_matchinfo:$matchinfo), | ||
(match (wip_match_opcode G_ADD, G_SUB):$root, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(match (G_SEXT $lhs, $s1),
(G_SEXT $rhs, $s2),
(G_ADD $root, $lhs, $rhs),
[{ return matchPushAddSubExt(${root}, ${lhs}, ${rhs}, ${s1}, ${s2}, MRI, ${matchinfo}); }]),
Why is the combine rooted on add/sub and not on abs? |
ed587b5
to
b91b94e
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
b91b94e
to
30eb02d
Compare
LLT ExtDstTy = MRI.getType(ExtDstReg); | ||
LLT Ext1SrcTy = MRI.getType(SrcReg1); | ||
LLT Ext2SrcTy = MRI.getType(SrcReg2); | ||
if (((Ext1SrcTy.getScalarSizeInBits() == 8 && | ||
ExtDstTy.getScalarSizeInBits() == 32) || | ||
((Ext1SrcTy.getScalarSizeInBits() == 8 || | ||
Ext1SrcTy.getScalarSizeInBits() == 16) && | ||
ExtDstTy.getScalarSizeInBits() == 64)) && | ||
Ext1SrcTy == Ext2SrcTy) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you rephrase the type constraints more generally? Like with free zext or something? And move to generic combines?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think there is much in terms of free instructions in this combine. This is the GISel equivalent of https://reviews.llvm.org/D128426, which always felt quite AArch64 specific to me, due to the way the saddl/saddl2/sshll instructions are legalized when extending types. Splitting the type up early can be profitable, but I'm not sure how to generally say that would be the case on different architectures.
@@ -1193,6 +1193,366 @@ define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) { | |||
ret <2 x i64> %res | |||
} | |||
|
|||
define <8 x i32> @saddl_v8i8_v8i32(<8 x i8> %a, <8 x i8> %b) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this re-use the tests in llvm/test/CodeGen/AArch64/neon-extadd.ll, with GISel coverage?
LLT ExtDstTy = MRI.getType(ExtDstReg); | ||
LLT Ext1SrcTy = MRI.getType(SrcReg1); | ||
LLT Ext2SrcTy = MRI.getType(SrcReg2); | ||
if (((Ext1SrcTy.getScalarSizeInBits() == 8 && | ||
ExtDstTy.getScalarSizeInBits() == 32) || | ||
((Ext1SrcTy.getScalarSizeInBits() == 8 || | ||
Ext1SrcTy.getScalarSizeInBits() == 16) && | ||
ExtDstTy.getScalarSizeInBits() == 64)) && | ||
Ext1SrcTy == Ext2SrcTy) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think there is much in terms of free instructions in this combine. This is the GISel equivalent of https://reviews.llvm.org/D128426, which always felt quite AArch64 specific to me, due to the way the saddl/saddl2/sshll instructions are legalized when extending types. Splitting the type up early can be profitable, but I'm not sure how to generally say that would be the case on different architectures.
Register Ext2Reg = B.buildInstr(Opc, {MidTy}, {SrcReg2}).getReg(0); | ||
Register AddReg = | ||
B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0); | ||
if (MI.getOpcode() == TargetOpcode::G_ADD) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a comment explaining the outer extends.
LLT ExtDstTy = MRI.getType(ExtDstReg); | ||
LLT Ext1SrcTy = MRI.getType(SrcReg1); | ||
LLT Ext2SrcTy = MRI.getType(SrcReg2); | ||
if (((Ext1SrcTy.getScalarSizeInBits() == 8 && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps pull the getScalarSizeInBits out into a variable.
30eb02d
to
e3e0f1e
Compare
def push_sub_through_zext : push_opcode_through_ext<G_SUB, G_ZEXT>; | ||
def push_add_through_zext : push_opcode_through_ext<G_ADD, G_ZEXT>; | ||
def push_sub_through_sext : push_opcode_through_ext<G_SUB, G_SEXT>; | ||
def push_add_through_sext : push_opcode_through_ext<G_ADD, G_SEXT>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice!
B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0); | ||
|
||
// G_SUB has to sign-extend the result. | ||
// G_ADD needs to zext from zext and can sext or zext from sext, so the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Alive2 disagrees that zext can be used after sext. https://alive2.llvm.org/ce/z/74UedG
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry - I suggested that wording when chatting with Chuong earlier, and typed it out the wrong way around. https://alive2.llvm.org/ce/z/FBkqxN
e3e0f1e
to
aa76c25
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8)) Reduces the amount of shift instructions generated by selecting {s|u}addl instruction as early as possible During instruction selection the result will be selected as: i32 ext (i16 uaddl i8, i8) Instead of: i32 uaddl (i16 ext i8, i16 ext i8)
aa76c25
to
b770cff
Compare
The regression in one test is due to a SUB instruction being pushed through the extend, leaving behind the abs instruction, which prevents it from selecting uabdl instructions shown below:
i32 abs(i32 sub(i32 ext i8, i32 ext i8))
=>i32 abs(i32 ext(i16 sub(i16 ext i8, i16 ext i8)))
This is intended to be fixed in a follow up patch