diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 476d99c2a7e04..852a4a4f76810 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -4275,6 +4275,58 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
 
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  // Essentially: rotr (xor(x, y), imm) -> xar (x, y, imm)
+  // Rotate by a constant is a funnel shift in IR which is expanded to
+  // an OR with shifted operands.
+  // We do the following transform:
+  //   OR N0, N1 -> xar (x, y, imm)
+  // Where:
+  //   N1 = SRL_PRED true, V, splat(imm)  --> rotr amount
+  //   N0 = SHL_PRED true, V, splat(bits-imm)
+  //   V = (xor x, y)
+  if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
+    if (N0.getOpcode() != AArch64ISD::SHL_PRED ||
+        N1.getOpcode() != AArch64ISD::SRL_PRED)
+      std::swap(N0, N1);
+    if (N0.getOpcode() != AArch64ISD::SHL_PRED ||
+        N1.getOpcode() != AArch64ISD::SRL_PRED)
+      return false;
+
+    auto *TLI = static_cast<const AArch64TargetLowering *>(getTargetLowering());
+    if (!TLI->isAllActivePredicate(*CurDAG, N0.getOperand(0)) ||
+        !TLI->isAllActivePredicate(*CurDAG, N1.getOperand(0)))
+      return false;
+
+    SDValue XOR = N0.getOperand(1);
+    if (XOR.getOpcode() != ISD::XOR || XOR != N1.getOperand(1))
+      return false;
+
+    APInt ShlAmt, ShrAmt;
+    if (!ISD::isConstantSplatVector(N0.getOperand(2).getNode(), ShlAmt) ||
+        !ISD::isConstantSplatVector(N1.getOperand(2).getNode(), ShrAmt))
+      return false;
+
+    if (ShlAmt + ShrAmt != VT.getScalarSizeInBits())
+      return false;
+
+    SDLoc DL(N);
+    SDValue Imm =
+        CurDAG->getTargetConstant(ShrAmt.getZExtValue(), DL, MVT::i32);
+
+    SDValue Ops[] = {XOR.getOperand(0), XOR.getOperand(1), Imm};
+    if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::Int>(
+            VT, {AArch64::XAR_ZZZI_B, AArch64::XAR_ZZZI_H, AArch64::XAR_ZZZI_S,
+                 AArch64::XAR_ZZZI_D})) {
+      CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+      return true;
+    }
+    return false;
+  }
+
+  if (!Subtarget->hasSHA3())
+    return false;
 
   if (N0->getOpcode() != AArch64ISD::VSHL ||
       N1->getOpcode() != AArch64ISD::VLSHR)
@@ -4367,7 +4419,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
   case ISD::OR:
     if (tryBitfieldInsertOp(Node))
       return;
-    if (Subtarget->hasSHA3() && trySelectXAR(Node))
+    if (trySelectXAR(Node))
       return;
     break;
 
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index b17e215e200de..a131cf8a6f540 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -394,6 +394,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   void mirFileLoaded(MachineFunction &MF) const override;
 
   bool hasSVEorSME() const { return hasSVE() || hasSME(); }
+  bool hasSVE2orSME() const { return hasSVE2() || hasSME(); }
 
   // Return the known range for the bit length of SVE data registers. A value
   // of 0 means nothing is known about that particular limit beyong what's
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
new file mode 100644
index 0000000000000..62680522bab93
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck --check-prefixes=CHECK,SVE %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s -o - | FileCheck --check-prefixes=CHECK,SVE2 %s
+
+define <vscale x 2 x i64> @xar_nxv2i64_l(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; SVE-LABEL: xar_nxv2i64_l:
+; SVE:       // %bb.0:
+; SVE-NEXT:    eor z0.d, z0.d, z1.d
+; SVE-NEXT:    lsr z1.d, z0.d, #4
+; SVE-NEXT:    lsl z0.d, z0.d, #60
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: xar_nxv2i64_l:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    xar z0.d, z0.d, z1.d, #4
+; SVE2-NEXT:    ret
+    %a = xor <vscale x 2 x i64> %x, %y
+    %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 60))
+    ret <vscale x 2 x i64> %b
+}
+
+define <vscale x 2 x i64> @xar_nxv2i64_r(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; SVE-LABEL: xar_nxv2i64_r:
+; SVE:       // %bb.0:
+; SVE-NEXT:    eor z0.d, z0.d, z1.d
+; SVE-NEXT:    lsl z1.d, z0.d, #60
+; SVE-NEXT:    lsr z0.d, z0.d, #4
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: xar_nxv2i64_r:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    xar z0.d, z0.d, z1.d, #4
+; SVE2-NEXT:    ret
+    %a = xor <vscale x 2 x i64> %x, %y
+    %b = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 4))
+    ret <vscale x 2 x i64> %b
+}
+
+
+define <vscale x 4 x i32> @xar_nxv4i32_l(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
+; SVE-LABEL: xar_nxv4i32_l:
+; SVE:       // %bb.0:
+; SVE-NEXT:    eor z0.d, z0.d, z1.d
+; SVE-NEXT:    lsr z1.s, z0.s, #4
+; SVE-NEXT:    lsl z0.s, z0.s, #28
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: xar_nxv4i32_l:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    xar z0.s, z0.s, z1.s, #4
+; SVE2-NEXT:    ret
+    %a = xor <vscale x 4 x i32> %x, %y
+    %b = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 28))
+    ret <vscale x 4 x i32> %b
+}
+
+define <vscale x 4 x i32> @xar_nxv4i32_r(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
+; SVE-LABEL: xar_nxv4i32_r:
+; SVE:       // %bb.0:
+; SVE-NEXT:    eor z0.d, z0.d, z1.d
+; SVE-NEXT:    lsl z1.s, z0.s, #28
+; SVE-NEXT:    lsr z0.s, z0.s, #4
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: xar_nxv4i32_r:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    xar z0.s, z0.s, z1.s, #4
+; SVE2-NEXT:    ret
+    %a = xor <vscale x 4 x i32> %x, %y
+    %b = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 4))
+    ret <vscale x 4 x i32> %b
+}
+
+define <vscale x 8 x i16> @xar_nxv8i16_l(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
+; SVE-LABEL: xar_nxv8i16_l:
+; SVE:       // %bb.0:
+; SVE-NEXT:    eor z0.d, z0.d, z1.d
+; SVE-NEXT:    lsr z1.h, z0.h, #4
+; SVE-NEXT:    lsl z0.h, z0.h, #12
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: xar_nxv8i16_l:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    xar z0.h, z0.h, z1.h, #4
+; SVE2-NEXT:    ret
+    %a = xor <vscale x 8 x i16> %x, %y
+    %b = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 12))
+    ret <vscale x 8 x i16> %b
+}
+
+define <vscale x 8 x i16> @xar_nxv8i16_r(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
+; SVE-LABEL: xar_nxv8i16_r:
+; SVE:       // %bb.0:
+; SVE-NEXT:    eor z0.d, z0.d, z1.d
+; SVE-NEXT:    lsl z1.h, z0.h, #12
+; SVE-NEXT:    lsr z0.h, z0.h, #4
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: xar_nxv8i16_r:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    xar z0.h, z0.h, z1.h, #4
+; SVE2-NEXT:    ret
+    %a = xor <vscale x 8 x i16> %x, %y
+    %b = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 4))
+    ret <vscale x 8 x i16> %b
+}
+
+define <vscale x 16 x i8> @xar_nxv16i8_l(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+; SVE-LABEL: xar_nxv16i8_l:
+; SVE:       // %bb.0:
+; SVE-NEXT:    eor z0.d, z0.d, z1.d
+; SVE-NEXT:    lsr z1.b, z0.b, #4
+; SVE-NEXT:    lsl z0.b, z0.b, #4
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: xar_nxv16i8_l:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    xar z0.b, z0.b, z1.b, #4
+; SVE2-NEXT:    ret
+    %a = xor <vscale x 16 x i8> %x, %y
+    %b = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> splat (i8 4))
+    ret <vscale x 16 x i8> %b
+}
+
+define <vscale x 16 x i8> @xar_nxv16i8_r(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+; SVE-LABEL: xar_nxv16i8_r:
+; SVE:       // %bb.0:
+; SVE-NEXT:    eor z0.d, z0.d, z1.d
+; SVE-NEXT:    lsl z1.b, z0.b, #4
+; SVE-NEXT:    lsr z0.b, z0.b, #4
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: xar_nxv16i8_r:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    xar z0.b, z0.b, z1.b, #4
+; SVE2-NEXT:    ret
+    %a = xor <vscale x 16 x i8> %x, %y
+    %b = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> splat (i8 4))
+    ret <vscale x 16 x i8> %b
+}
+
+; Shift is not a constant.
+define <vscale x 2 x i64> @xar_nxv2i64_l_neg1(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i64> %z) {
+; CHECK-LABEL: xar_nxv2i64_l_neg1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    subr z2.d, z2.d, #0 // =0x0
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    and z2.d, z2.d, #0x3f
+; CHECK-NEXT:    and z3.d, z3.d, #0x3f
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    lsl z1.d, p0/m, z1.d, z3.d
+; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    orr z0.d, z1.d, z0.d
+; CHECK-NEXT:    ret
+    %a = xor <vscale x 2 x i64> %x, %y
+    %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> %z)
+    ret <vscale x 2 x i64> %b
+}
+
+; OR instead of an XOR.
+; TODO: We could use usra instruction here for SVE2.
+define <vscale x 2 x i64> @xar_nxv2i64_l_neg2(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; CHECK-LABEL: xar_nxv2i64_l_neg2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    lsr z1.d, z0.d, #4
+; CHECK-NEXT:    lsl z0.d, z0.d, #60
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
+    %a = or <vscale x 2 x i64> %x, %y
+    %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 60))
+    ret <vscale x 2 x i64> %b
+}
+
+; Rotate amount is 0.
+define <vscale x 2 x i64> @xar_nxv2i64_l_neg3(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; CHECK-LABEL: xar_nxv2i64_l_neg3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
+    %a = xor <vscale x 2 x i64> %x, %y
+    %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 64))
+    ret <vscale x 2 x i64> %b
+}
+
+; Uses individual shifts instead of funnel shifts, just one test.
+define <vscale x 2 x i64> @xar_nxv2i64_shifts(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; SVE-LABEL: xar_nxv2i64_shifts:
+; SVE:       // %bb.0:
+; SVE-NEXT:    eor z0.d, z0.d, z1.d
+; SVE-NEXT:    lsr z1.d, z0.d, #4
+; SVE-NEXT:    lsl z0.d, z0.d, #60
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: xar_nxv2i64_shifts:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    xar z0.d, z0.d, z1.d, #4
+; SVE2-NEXT:    ret
+    %xor = xor <vscale x 2 x i64> %x, %y
+    %shl = shl <vscale x 2 x i64> %xor, splat (i64 60)
+    %shr = lshr <vscale x 2 x i64> %xor, splat (i64 4)
+    %or = or <vscale x 2 x i64> %shl, %shr
+    ret <vscale x 2 x i64> %or
+}
+
+; Not a rotate operation as 60 + 3 != 64
+define <vscale x 2 x i64> @xar_nxv2i64_shifts_neg(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; CHECK-LABEL: xar_nxv2i64_shifts_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    lsl z1.d, z0.d, #60
+; CHECK-NEXT:    lsr z0.d, z0.d, #3
+; CHECK-NEXT:    orr z0.d, z1.d, z0.d
+; CHECK-NEXT:    ret
+    %xor = xor <vscale x 2 x i64> %x, %y
+    %shl = shl <vscale x 2 x i64> %xor, splat (i64 60)
+    %shr = lshr <vscale x 2 x i64> %xor, splat (i64 3)
+    %or = or <vscale x 2 x i64> %shl, %shr
+    ret <vscale x 2 x i64> %or
+}
+
+declare <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
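Reviewer note (not part of the patch): below is a minimal standalone C++ sketch of the identity the new DAG code relies on, namely that the OR-of-shifts form produced by expanding a constant-amount funnel shift of (x ^ y) computes the same value as the XOR-then-rotate-right semantics of SVE2 XAR. The helper names (xar_lane, expanded_lane) and the sample operands are made up for illustration and are not LLVM API.

// Plain C++17, no LLVM dependencies; models a single 64-bit lane.
#include <cassert>
#include <cstdint>

// One lane of SVE2 XAR: exclusive-or the inputs, then rotate right by imm.
static uint64_t xar_lane(uint64_t x, uint64_t y, unsigned imm) {
  uint64_t v = x ^ y;
  return (v >> imm) | (v << (64 - imm)); // assumes 0 < imm < 64
}

// The pattern trySelectXAR matches after funnel-shift expansion:
// OR of SHL_PRED and SRL_PRED of the same XOR, with splat shift amounts.
static uint64_t expanded_lane(uint64_t x, uint64_t y, unsigned imm) {
  uint64_t v = x ^ y;
  uint64_t shl = v << (64 - imm); // N0 = SHL_PRED true, V, splat(bits-imm)
  uint64_t srl = v >> imm;        // N1 = SRL_PRED true, V, splat(imm)
  return shl | srl;               // the ISD::OR node rewritten to XAR_ZZZI_D
}

int main() {
  // Mirrors xar_nxv2i64_r above: rotate right by 4 after the XOR.
  assert(xar_lane(0x0123456789abcdefULL, 0xfedcba9876543210ULL, 4) ==
         expanded_lane(0x0123456789abcdefULL, 0xfedcba9876543210ULL, 4));
  return 0;
}

This corresponds to the SVE2 check lines in the test, where the eor/lsl/lsr/orr sequence collapses to a single xar z0.d, z0.d, z1.d, #4.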