diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp index 11866f2dd1864..e9aed60595e68 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp @@ -48,6 +48,7 @@ class AArch64PostSelectOptimize : public MachineFunctionPass { bool doPeepholeOpts(MachineBasicBlock &MBB); /// Look for cross regclass copies that can be trivially eliminated. bool foldSimpleCrossClassCopies(MachineInstr &MI); + bool foldCopyDup(MachineInstr &MI); }; } // end anonymous namespace @@ -105,7 +106,10 @@ unsigned getNonFlagSettingVariant(unsigned Opc) { bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) { bool Changed = false; for (auto &MI : make_early_inc_range(make_range(MBB.begin(), MBB.end()))) { - Changed |= foldSimpleCrossClassCopies(MI); + bool CurrentIterChanged = foldSimpleCrossClassCopies(MI); + if (!CurrentIterChanged) + CurrentIterChanged |= foldCopyDup(MI); + Changed |= CurrentIterChanged; } return Changed; } @@ -158,6 +162,68 @@ bool AArch64PostSelectOptimize::foldSimpleCrossClassCopies(MachineInstr &MI) { return true; } +bool AArch64PostSelectOptimize::foldCopyDup(MachineInstr &MI) { + if (!MI.isCopy()) + return false; + + auto *MF = MI.getMF(); + auto &MRI = MF->getRegInfo(); + auto *TII = MF->getSubtarget().getInstrInfo(); + + // Optimize COPY(y:GPR, DUP(x:FPR, i)) -> UMOV(y:GPR, x:FPR, i). + // Here Dst is y and Src is the result of DUP. + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + + if (!Dst.isVirtual() || !Src.isVirtual()) + return false; + + auto TryMatchDUP = [&](const TargetRegisterClass *GPRRegClass, + const TargetRegisterClass *FPRRegClass, unsigned DUP, + unsigned UMOV) { + if (MRI.getRegClassOrNull(Dst) != GPRRegClass || + MRI.getRegClassOrNull(Src) != FPRRegClass) + return false; + + // There is a special case when one of the uses is COPY(z:FPR, y:GPR). + // In this case, we get COPY(z:FPR, COPY(y:GPR, DUP(x:FPR, i))), which can + // be folded by peephole-opt into just DUP(z:FPR, i), so this transform is + // not worthwhile in that case. 
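+    // Illustrative sketch of the fold in MIR form (the virtual register
+    // names here are hypothetical, not taken from any particular test):
+    //
+    //   %src:fpr64 = DUPi64 %vec:fpr128, 1
+    //   %dst:gpr64 = COPY %src:fpr64
+    // -->
+    //   %dst:gpr64 = UMOVvi64 %vec:fpr128, 1
+    //
+    // The use scan below rejects the fold when %dst is copied back into an
+    // FPR, since peephole-opt can then collapse the whole chain into a
+    // single DUP without a round trip through a GPR.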
+ for (auto &Use : MRI.use_nodbg_instructions(Dst)) { + if (!Use.isCopy()) + continue; + + Register UseOp0 = Use.getOperand(0).getReg(); + Register UseOp1 = Use.getOperand(1).getReg(); + if (UseOp0.isPhysical() || UseOp1.isPhysical()) + return false; + + if (MRI.getRegClassOrNull(UseOp0) == FPRRegClass && + MRI.getRegClassOrNull(UseOp1) == GPRRegClass) + return false; + } + + MachineInstr *SrcMI = MRI.getUniqueVRegDef(Src); + if (!SrcMI || SrcMI->getOpcode() != DUP || !MRI.hasOneNonDBGUse(Src)) + return false; + + Register DupSrc = SrcMI->getOperand(1).getReg(); + int64_t DupImm = SrcMI->getOperand(2).getImm(); + + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(UMOV), Dst) + .addReg(DupSrc) + .addImm(DupImm); + SrcMI->eraseFromParent(); + MI.eraseFromParent(); + return true; + }; + + return TryMatchDUP(&AArch64::GPR32RegClass, &AArch64::FPR32RegClass, + AArch64::DUPi32, AArch64::UMOVvi32) || + TryMatchDUP(&AArch64::GPR64RegClass, &AArch64::FPR64RegClass, + AArch64::DUPi64, AArch64::UMOVvi64); +} + bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) { // If we find a dead NZCV implicit-def, we // - try to convert the operation to a non-flag-setting equivalent diff --git a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll index 7b7ca9d8ffc2d..e11ae9a251590 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll @@ -25,22 +25,13 @@ declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>) declare i128 @llvm.vector.reduce.mul.v2i128(<2 x i128>) define i8 @mulv_v2i8(<2 x i8> %a) { -; CHECK-SD-LABEL: mulv_v2i8: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: mul w0, w9, w8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: mulv_v2i8: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: mul w0, w8, w9 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: mulv_v2i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mul w0, w9, w8 +; CHECK-NEXT: ret entry: %arg1 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %a) ret i8 %arg1 @@ -230,22 +221,13 @@ entry: } define i16 @mulv_v2i16(<2 x i16> %a) { -; CHECK-SD-LABEL: mulv_v2i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: mul w0, w9, w8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: mulv_v2i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: mul w0, w8, w9 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: mulv_v2i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mul w0, w9, w8 +; CHECK-NEXT: ret entry: %arg1 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %a) ret i16 %arg1 @@ -372,22 +354,13 @@ entry: } define i32 @mulv_v2i32(<2 x i32> %a) { -; CHECK-SD-LABEL: mulv_v2i32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: 
mul w0, w9, w8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: mulv_v2i32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: mul w0, w8, w9 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: mulv_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mul w0, w9, w8 +; CHECK-NEXT: ret entry: %arg1 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a) ret i32 %arg1 @@ -424,10 +397,9 @@ define i32 @mulv_v4i32(<4 x i32> %a) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: mul w0, w8, w9 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mul w0, w9, w8 ; CHECK-GI-NEXT: ret entry: %arg1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a) @@ -452,10 +424,9 @@ define i32 @mulv_v8i32(<8 x i32> %a) { ; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s ; CHECK-GI-NEXT: mul v1.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: mul w0, w8, w9 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mul w0, w9, w8 ; CHECK-GI-NEXT: ret entry: %arg1 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a) @@ -463,20 +434,12 @@ entry: } define i64 @mulv_v2i64(<2 x i64> %a) { -; CHECK-SD-LABEL: mulv_v2i64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov x8, v0.d[1] -; CHECK-SD-NEXT: fmov x9, d0 -; CHECK-SD-NEXT: mul x0, x9, x8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: mulv_v2i64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov d1, v0.d[1] -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: mul x0, x8, x9 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: mulv_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: mul x0, x9, x8 +; CHECK-NEXT: ret entry: %arg1 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %a) ret i64 %arg1 @@ -522,14 +485,12 @@ define i64 @mulv_v4i64(<4 x i64> %a) { ; ; CHECK-GI-LABEL: mulv_v4i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov d2, v0.d[1] -; CHECK-GI-NEXT: mov d3, v1.d[1] -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fmov x9, d2 -; CHECK-GI-NEXT: fmov x10, d3 -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: mul x9, x9, x10 +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: fmov x10, d0 +; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: mul x8, x10, x8 +; CHECK-GI-NEXT: fmov x10, d1 +; CHECK-GI-NEXT: mul x9, x10, x9 ; CHECK-GI-NEXT: mul x0, x8, x9 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 540471a05901a..307aa397eabbb 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -279,17 +279,15 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-GI-NEXT: ldr d0, [x1] ; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: fmov d1, x8 -; CHECK-GI-NEXT: mov d3, v0.d[1] +; CHECK-GI-NEXT: fmov x11, d0 ; CHECK-GI-NEXT: mov v1.d[1], x9 -; CHECK-GI-NEXT: fmov x9, d0 -; CHECK-GI-NEXT: fmov x10, d3 -; CHECK-GI-NEXT: mov d2, 
v1.d[1] -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov x9, v0.d[1] +; CHECK-GI-NEXT: fmov x10, d1 +; CHECK-GI-NEXT: mov x8, v1.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov x9, d2 -; CHECK-GI-NEXT: mul x9, x9, x10 -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: fmov d0, x10 +; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: ret %load.A = load <2 x i16>, ptr %A %load.B = load <2 x i32>, ptr %B @@ -324,16 +322,14 @@ define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: mov d3, v1.d[1] -; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: mov d2, v0.d[1] -; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x11, d1 +; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: fmov x10, d0 +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov x10, d3 -; CHECK-GI-NEXT: fmov x9, d2 -; CHECK-GI-NEXT: mul x9, x9, x10 -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: fmov d0, x10 +; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: ret %load.A = load <2 x i32>, ptr %A %and.A = and <2 x i32> %load.A, @@ -1052,16 +1048,14 @@ define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { ; CHECK-GI-NEXT: adrp x8, .LCPI36_0 ; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI36_0] -; CHECK-GI-NEXT: mov d2, v0.d[1] -; CHECK-GI-NEXT: mov d3, v1.d[1] -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: fmov x10, d0 +; CHECK-GI-NEXT: fmov x11, d1 +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov x9, d2 -; CHECK-GI-NEXT: fmov x10, d3 -; CHECK-GI-NEXT: mul x9, x9, x10 -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: fmov d0, x10 +; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: ret %tmp3 = sext <2 x i32> %arg to <2 x i64> %tmp4 = mul <2 x i64> %tmp3, @@ -1169,16 +1163,14 @@ define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { ; CHECK-GI-NEXT: adrp x8, .LCPI40_0 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI40_0] -; CHECK-GI-NEXT: mov d2, v0.d[1] -; CHECK-GI-NEXT: mov d3, v1.d[1] -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: fmov x10, d0 +; CHECK-GI-NEXT: fmov x11, d1 +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov x9, d2 -; CHECK-GI-NEXT: fmov x10, d3 -; CHECK-GI-NEXT: mul x9, x9, x10 -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: fmov d0, x10 +; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: ret %tmp3 = zext <2 x i32> %arg to <2 x i64> %tmp4 = mul <2 x i64> %tmp3, @@ -1272,17 +1264,15 @@ define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { ; CHECK-GI-NEXT: adrp x8, .LCPI43_0 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI43_0] -; CHECK-GI-NEXT: mov d2, v0.d[1] -; CHECK-GI-NEXT: mov d3, v1.d[1] -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: fmov x10, d0 +; CHECK-GI-NEXT: fmov x11, d1 +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov x9, v1.d[1] ; CHECK-GI-NEXT: movi v1.2d, 
#0x000000ffffffff +; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov x9, d2 -; CHECK-GI-NEXT: fmov x10, d3 -; CHECK-GI-NEXT: mul x9, x9, x10 -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: fmov d0, x10 +; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: ret %tmp3 = zext <2 x i32> %arg to <2 x i64> @@ -1901,17 +1891,15 @@ define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2d, #0x000000000000ff ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x10, d0 +; CHECK-GI-NEXT: mov x8, v0.d[1] ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: mov d2, v0.d[1] -; CHECK-GI-NEXT: mov d3, v1.d[1] -; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: fmov x11, d1 +; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov x9, d2 -; CHECK-GI-NEXT: fmov x10, d3 -; CHECK-GI-NEXT: mul x9, x9, x10 -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: fmov d0, x10 +; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: ret entry: %in1 = zext <2 x i32> %src1 to <2 x i64> @@ -1947,26 +1935,22 @@ define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) { ; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0 ; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0 ; CHECK-GI-NEXT: fmov x8, d4 +; CHECK-GI-NEXT: mov x10, v4.d[1] +; CHECK-GI-NEXT: mov x13, v0.d[1] ; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b ; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b -; CHECK-GI-NEXT: mov d3, v4.d[1] ; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: mov d4, v1.d[1] -; CHECK-GI-NEXT: fmov x10, d2 -; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: fmov x12, d2 +; CHECK-GI-NEXT: mov x11, v1.d[1] +; CHECK-GI-NEXT: mov x14, v2.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 ; CHECK-GI-NEXT: fmov x9, d0 -; CHECK-GI-NEXT: mov d0, v2.d[1] -; CHECK-GI-NEXT: fmov x11, d4 -; CHECK-GI-NEXT: mul x9, x9, x10 -; CHECK-GI-NEXT: fmov x10, d3 -; CHECK-GI-NEXT: fmov x12, d0 -; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: mul x10, x10, x11 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: fmov d1, x9 -; CHECK-GI-NEXT: mul x11, x11, x12 +; CHECK-GI-NEXT: mul x9, x9, x12 +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: mul x11, x13, x14 ; CHECK-GI-NEXT: mov v0.d[1], x10 +; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: mov v1.d[1], x11 ; CHECK-GI-NEXT: ret entry: @@ -1999,20 +1983,17 @@ define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) { ; CHECK-GI-NEXT: ushll v1.2d, v0.2s, #0 ; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0 ; CHECK-GI-NEXT: dup v2.2d, x8 -; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: fmov x8, d1 -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: mov d1, v2.d[1] +; CHECK-GI-NEXT: fmov x12, d0 +; CHECK-GI-NEXT: mov x10, v1.d[1] ; CHECK-GI-NEXT: fmov x9, d2 -; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov x11, v2.d[1] +; CHECK-GI-NEXT: mov x13, v0.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: fmov x12, d2 -; CHECK-GI-NEXT: mul x9, x10, x9 -; CHECK-GI-NEXT: fmov x10, d3 +; CHECK-GI-NEXT: mul x9, x12, x9 ; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: mul x11, x12, x11 +; CHECK-GI-NEXT: mul x11, x13, x11 ; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: mov v0.d[1], x10 ; CHECK-GI-NEXT: mov v1.d[1], x11 diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll 
b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll index 749d6071c98d7..43d5ab5ab54e1 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1488,8 +1488,7 @@ define <4 x i16> @test_dup_v2i32_v4i16(<2 x i32> %a) { ; CHECK-GI-LABEL: test_dup_v2i32_v4i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s0, v0.s[1] -; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov w8, v0.s[1] ; CHECK-GI-NEXT: dup v0.4h, w8 ; CHECK-GI-NEXT: ret entry: @@ -1510,8 +1509,7 @@ define <8 x i16> @test_dup_v4i32_v8i16(<4 x i32> %a) { ; ; CHECK-GI-LABEL: test_dup_v4i32_v8i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s0, v0.s[3] -; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov w8, v0.s[3] ; CHECK-GI-NEXT: dup v0.8h, w8 ; CHECK-GI-NEXT: ret entry: @@ -1578,8 +1576,7 @@ define <8 x i16> @test_dup_v2i64_v8i16(<2 x i64> %a) { ; ; CHECK-GI-LABEL: test_dup_v2i64_v8i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: mov x8, v0.d[1] ; CHECK-GI-NEXT: dup v0.8h, w8 ; CHECK-GI-NEXT: ret entry: @@ -1626,8 +1623,7 @@ define <4 x i16> @test_dup_v4i32_v4i16(<4 x i32> %a) { ; ; CHECK-GI-LABEL: test_dup_v4i32_v4i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s0, v0.s[1] -; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov w8, v0.s[1] ; CHECK-GI-NEXT: dup v0.4h, w8 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index e0851fd8739ec..5de99586f7fc7 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -517,10 +517,8 @@ define <4 x i64> @bitcast_v8i32_v4i64(<8 x i32> %a, <8 x i32> %b){ ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-GI-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-GI-NEXT: mov d2, v0.d[1] -; CHECK-GI-NEXT: mov d3, v1.d[1] -; CHECK-GI-NEXT: fmov x8, d2 -; CHECK-GI-NEXT: fmov x9, d3 +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov x9, v1.d[1] ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: ret @@ -578,10 +576,8 @@ define <4 x i64> @bitcast_v16i16_v4i64(<16 x i16> %a, <16 x i16> %b){ ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h ; CHECK-GI-NEXT: add v1.8h, v1.8h, v3.8h -; CHECK-GI-NEXT: mov d2, v0.d[1] -; CHECK-GI-NEXT: mov d3, v1.d[1] -; CHECK-GI-NEXT: fmov x8, d2 -; CHECK-GI-NEXT: fmov x9, d3 +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov x9, v1.d[1] ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: ret @@ -622,14 +618,10 @@ define <8 x i64> @bitcast_v16i32_v8i64(<16 x i32> %a, <16 x i32> %b){ ; CHECK-GI-NEXT: add v1.4s, v1.4s, v5.4s ; CHECK-GI-NEXT: add v2.4s, v2.4s, v6.4s ; CHECK-GI-NEXT: add v3.4s, v3.4s, v7.4s -; CHECK-GI-NEXT: mov d4, v0.d[1] -; CHECK-GI-NEXT: mov d5, v1.d[1] -; CHECK-GI-NEXT: mov d6, v2.d[1] -; CHECK-GI-NEXT: mov d7, v3.d[1] -; CHECK-GI-NEXT: fmov x8, d4 -; CHECK-GI-NEXT: fmov x9, d5 -; CHECK-GI-NEXT: fmov x10, d6 -; CHECK-GI-NEXT: fmov x11, d7 +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: mov x10, v2.d[1] +; CHECK-GI-NEXT: mov x11, v3.d[1] ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: mov v2.d[1], x10 diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll index c6b2d07231bf8..8b82004388b09 100644 --- a/llvm/test/CodeGen/AArch64/insertextract.ll +++ 
b/llvm/test/CodeGen/AArch64/insertextract.ll @@ -983,13 +983,12 @@ define <3 x i32> @insert_v3i32_0(<3 x i32> %a, i32 %b, i32 %c) { ; ; CHECK-GI-LABEL: insert_v3i32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov s2, v0.s[2] -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov w8, s1 -; CHECK-GI-NEXT: mov v0.s[1], w8 -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: fmov s1, w0 +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v1.s[2], w9 +; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret entry: %d = insertelement <3 x i32> %a, i32 %b, i32 0 diff --git a/llvm/test/CodeGen/AArch64/ptradd.ll b/llvm/test/CodeGen/AArch64/ptradd.ll index 107db8723c646..af283f6a093e9 100644 --- a/llvm/test/CodeGen/AArch64/ptradd.ll +++ b/llvm/test/CodeGen/AArch64/ptradd.ll @@ -81,13 +81,12 @@ define void @vector_gep_v3i32(<3 x ptr> %b, <3 x i32> %off, ptr %p) { ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: smov x9, v3.s[1] -; CHECK-GI-NEXT: mov s3, v3.s[2] ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: fmov d1, x8 -; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mov w8, v3.s[2] ; CHECK-GI-NEXT: mov v1.d[1], x9 -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: add x8, x8, w9, sxtw +; CHECK-GI-NEXT: fmov x9, d2 +; CHECK-GI-NEXT: add x8, x9, w8, sxtw ; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-GI-NEXT: str x8, [x0, #16] ; CHECK-GI-NEXT: str q0, [x0] diff --git a/llvm/test/CodeGen/AArch64/reduce-and.ll b/llvm/test/CodeGen/AArch64/reduce-and.ll index 62ad45b212967..8ca521327c2e3 100644 --- a/llvm/test/CodeGen/AArch64/reduce-and.ll +++ b/llvm/test/CodeGen/AArch64/reduce-and.ll @@ -30,10 +30,9 @@ define i1 @test_redand_v2i1(<2 x i1> %a) { ; GISEL-LABEL: test_redand_v2i1: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov s1, v0.s[1] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: and w8, w8, w9 +; GISEL-NEXT: mov w8, v0.s[1] +; GISEL-NEXT: fmov w9, s0 +; GISEL-NEXT: and w8, w9, w8 ; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret %or_result = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) @@ -457,10 +456,9 @@ define i32 @test_redand_v2i32(<2 x i32> %a) { ; GISEL-LABEL: test_redand_v2i32: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov s1, v0.s[1] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: and w0, w8, w9 +; GISEL-NEXT: mov w8, v0.s[1] +; GISEL-NEXT: fmov w9, s0 +; GISEL-NEXT: and w0, w9, w8 ; GISEL-NEXT: ret %and_result = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a) ret i32 %and_result @@ -480,10 +478,9 @@ define i32 @test_redand_v4i32(<4 x i32> %a) { ; GISEL: // %bb.0: ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: and v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov s1, v0.s[1] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: and w0, w8, w9 +; GISEL-NEXT: mov w8, v0.s[1] +; GISEL-NEXT: fmov w9, s0 +; GISEL-NEXT: and w0, w9, w8 ; GISEL-NEXT: ret %and_result = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a) ret i32 %and_result @@ -505,10 +502,9 @@ define i32 @test_redand_v8i32(<8 x i32> %a) { ; GISEL-NEXT: and v0.16b, v0.16b, v1.16b ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: and v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov s1, v0.s[1] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: and w0, w8, w9 +; GISEL-NEXT: mov w8, 
v0.s[1] +; GISEL-NEXT: fmov w9, s0 +; GISEL-NEXT: and w0, w9, w8 ; GISEL-NEXT: ret %and_result = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a) ret i32 %and_result @@ -524,10 +520,9 @@ define i64 @test_redand_v2i64(<2 x i64> %a) { ; ; GISEL-LABEL: test_redand_v2i64: ; GISEL: // %bb.0: -; GISEL-NEXT: mov d1, v0.d[1] -; GISEL-NEXT: fmov x8, d0 -; GISEL-NEXT: fmov x9, d1 -; GISEL-NEXT: and x0, x8, x9 +; GISEL-NEXT: mov x8, v0.d[1] +; GISEL-NEXT: fmov x9, d0 +; GISEL-NEXT: and x0, x9, x8 ; GISEL-NEXT: ret %and_result = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a) ret i64 %and_result @@ -545,10 +540,9 @@ define i64 @test_redand_v4i64(<4 x i64> %a) { ; GISEL-LABEL: test_redand_v4i64: ; GISEL: // %bb.0: ; GISEL-NEXT: and v0.16b, v0.16b, v1.16b -; GISEL-NEXT: mov d1, v0.d[1] -; GISEL-NEXT: fmov x8, d0 -; GISEL-NEXT: fmov x9, d1 -; GISEL-NEXT: and x0, x8, x9 +; GISEL-NEXT: mov x8, v0.d[1] +; GISEL-NEXT: fmov x9, d0 +; GISEL-NEXT: and x0, x9, x8 ; GISEL-NEXT: ret %and_result = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %a) ret i64 %and_result diff --git a/llvm/test/CodeGen/AArch64/reduce-or.ll b/llvm/test/CodeGen/AArch64/reduce-or.ll index 20c498d36fdea..aac31ce8b71b7 100644 --- a/llvm/test/CodeGen/AArch64/reduce-or.ll +++ b/llvm/test/CodeGen/AArch64/reduce-or.ll @@ -30,10 +30,9 @@ define i1 @test_redor_v2i1(<2 x i1> %a) { ; GISEL-LABEL: test_redor_v2i1: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov s1, v0.s[1] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: mov w8, v0.s[1] +; GISEL-NEXT: fmov w9, s0 +; GISEL-NEXT: orr w8, w9, w8 ; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret %or_result = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %a) @@ -459,10 +458,9 @@ define i32 @test_redor_v2i32(<2 x i32> %a) { ; GISEL-LABEL: test_redor_v2i32: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov s1, v0.s[1] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: orr w0, w8, w9 +; GISEL-NEXT: mov w8, v0.s[1] +; GISEL-NEXT: fmov w9, s0 +; GISEL-NEXT: orr w0, w9, w8 ; GISEL-NEXT: ret %or_result = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a) ret i32 %or_result @@ -482,10 +480,9 @@ define i32 @test_redor_v4i32(<4 x i32> %a) { ; GISEL: // %bb.0: ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov s1, v0.s[1] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: orr w0, w8, w9 +; GISEL-NEXT: mov w8, v0.s[1] +; GISEL-NEXT: fmov w9, s0 +; GISEL-NEXT: orr w0, w9, w8 ; GISEL-NEXT: ret %or_result = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a) ret i32 %or_result @@ -507,10 +504,9 @@ define i32 @test_redor_v8i32(<8 x i32> %a) { ; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov s1, v0.s[1] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: orr w0, w8, w9 +; GISEL-NEXT: mov w8, v0.s[1] +; GISEL-NEXT: fmov w9, s0 +; GISEL-NEXT: orr w0, w9, w8 ; GISEL-NEXT: ret %or_result = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a) ret i32 %or_result @@ -526,10 +522,9 @@ define i64 @test_redor_v2i64(<2 x i64> %a) { ; ; GISEL-LABEL: test_redor_v2i64: ; GISEL: // %bb.0: -; GISEL-NEXT: mov d1, v0.d[1] -; GISEL-NEXT: fmov x8, d0 -; GISEL-NEXT: fmov x9, d1 -; GISEL-NEXT: orr x0, x8, x9 +; GISEL-NEXT: mov x8, v0.d[1] +; GISEL-NEXT: fmov x9, d0 +; GISEL-NEXT: orr x0, x9, x8 ; GISEL-NEXT: ret %or_result = call i64 
@llvm.vector.reduce.or.v2i64(<2 x i64> %a) ret i64 %or_result @@ -547,10 +542,9 @@ define i64 @test_redor_v4i64(<4 x i64> %a) { ; GISEL-LABEL: test_redor_v4i64: ; GISEL: // %bb.0: ; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b -; GISEL-NEXT: mov d1, v0.d[1] -; GISEL-NEXT: fmov x8, d0 -; GISEL-NEXT: fmov x9, d1 -; GISEL-NEXT: orr x0, x8, x9 +; GISEL-NEXT: mov x8, v0.d[1] +; GISEL-NEXT: fmov x9, d0 +; GISEL-NEXT: orr x0, x9, x8 ; GISEL-NEXT: ret %or_result = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a) ret i64 %or_result diff --git a/llvm/test/CodeGen/AArch64/reduce-xor.ll b/llvm/test/CodeGen/AArch64/reduce-xor.ll index b8ca99e003b62..9a00172f94763 100644 --- a/llvm/test/CodeGen/AArch64/reduce-xor.ll +++ b/llvm/test/CodeGen/AArch64/reduce-xor.ll @@ -27,10 +27,9 @@ define i1 @test_redxor_v2i1(<2 x i1> %a) { ; GISEL-LABEL: test_redxor_v2i1: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov s1, v0.s[1] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: eor w8, w8, w9 +; GISEL-NEXT: mov w8, v0.s[1] +; GISEL-NEXT: fmov w9, s0 +; GISEL-NEXT: eor w8, w9, w8 ; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret %or_result = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a) @@ -448,10 +447,9 @@ define i32 @test_redxor_v2i32(<2 x i32> %a) { ; GISEL-LABEL: test_redxor_v2i32: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov s1, v0.s[1] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: eor w0, w8, w9 +; GISEL-NEXT: mov w8, v0.s[1] +; GISEL-NEXT: fmov w9, s0 +; GISEL-NEXT: eor w0, w9, w8 ; GISEL-NEXT: ret %xor_result = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a) ret i32 %xor_result @@ -471,10 +469,9 @@ define i32 @test_redxor_v4i32(<4 x i32> %a) { ; GISEL: // %bb.0: ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: eor v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov s1, v0.s[1] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: eor w0, w8, w9 +; GISEL-NEXT: mov w8, v0.s[1] +; GISEL-NEXT: fmov w9, s0 +; GISEL-NEXT: eor w0, w9, w8 ; GISEL-NEXT: ret %xor_result = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a) ret i32 %xor_result @@ -496,10 +493,9 @@ define i32 @test_redxor_v8i32(<8 x i32> %a) { ; GISEL-NEXT: eor v0.16b, v0.16b, v1.16b ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: eor v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov s1, v0.s[1] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: eor w0, w8, w9 +; GISEL-NEXT: mov w8, v0.s[1] +; GISEL-NEXT: fmov w9, s0 +; GISEL-NEXT: eor w0, w9, w8 ; GISEL-NEXT: ret %xor_result = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a) ret i32 %xor_result @@ -515,10 +511,9 @@ define i64 @test_redxor_v2i64(<2 x i64> %a) { ; ; GISEL-LABEL: test_redxor_v2i64: ; GISEL: // %bb.0: -; GISEL-NEXT: mov d1, v0.d[1] -; GISEL-NEXT: fmov x8, d0 -; GISEL-NEXT: fmov x9, d1 -; GISEL-NEXT: eor x0, x8, x9 +; GISEL-NEXT: mov x8, v0.d[1] +; GISEL-NEXT: fmov x9, d0 +; GISEL-NEXT: eor x0, x9, x8 ; GISEL-NEXT: ret %xor_result = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a) ret i64 %xor_result @@ -536,10 +531,9 @@ define i64 @test_redxor_v4i64(<4 x i64> %a) { ; GISEL-LABEL: test_redxor_v4i64: ; GISEL: // %bb.0: ; GISEL-NEXT: eor v0.16b, v0.16b, v1.16b -; GISEL-NEXT: mov d1, v0.d[1] -; GISEL-NEXT: fmov x8, d0 -; GISEL-NEXT: fmov x9, d1 -; GISEL-NEXT: eor x0, x8, x9 +; GISEL-NEXT: mov x8, v0.d[1] +; GISEL-NEXT: fmov x9, d0 +; GISEL-NEXT: eor x0, x9, x8 ; GISEL-NEXT: ret %xor_result = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %a) ret i64 
%xor_result