diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
index 99df6e5ad1d71..3d11bf3651a36 100644
--- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -38,7 +38,8 @@ enum class ComplexDeinterleavingOperation {
   CMulPartial,
   // The following 'operations' are used to represent internal states. Backends
   // are not expected to try and support these in any capacity.
-  Shuffle
+  Shuffle,
+  Symmetric
 };
 
 enum class ComplexDeinterleavingRotation {
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 3945913040fc6..fcc25d900e6ed 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -254,6 +254,7 @@ class ComplexDeinterleavingGraph {
   /// 270:  r: ar + bi
   ///       i: ai - br
   NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
+  NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag);
 
   NodePtr identifyNode(Instruction *I, Instruction *J);
 
@@ -702,6 +703,67 @@ static bool isInstructionPairMul(Instruction *A, Instruction *B) {
   return match(A, Pattern) && match(B, Pattern);
 }
 
+/// Returns true if \p I has an opcode that acts on every lane independently
+/// and may therefore be applied directly to an interleaved complex vector
+/// when its real and imaginary parts are computed by the same opcode.
+static bool isInstructionPotentiallySymmetric(Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::FAdd:
+  case Instruction::FSub:
+  case Instruction::FMul:
+  case Instruction::FNeg:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Identifies a pair of instructions that apply the same lane-wise operation
+/// to the real and the imaginary part, e.g.
+///   r: fadd ar, br
+///   i: fadd ai, bi
+/// Every operand pair must itself be identifiable as a node; otherwise
+/// nullptr is returned.
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
+                                                       Instruction *Imag) {
+  // Both halves share one opcode, so checking Real for symmetry covers Imag.
+  if (Real->getOpcode() != Imag->getOpcode() ||
+      !isInstructionPotentiallySymmetric(Real))
+    return nullptr;
+
+  auto *R0 = dyn_cast<Instruction>(Real->getOperand(0));
+  auto *I0 = dyn_cast<Instruction>(Imag->getOperand(0));
+  if (!R0 || !I0)
+    return nullptr;
+
+  NodePtr Op0 = identifyNode(R0, I0);
+  if (Op0 == nullptr)
+    return nullptr;
+
+  // FNeg is the only unary opcode accepted above; everything else has a
+  // second operand pair that must be identified as well.
+  NodePtr Op1 = nullptr;
+  if (Real->isBinaryOp()) {
+    auto *R1 = dyn_cast<Instruction>(Real->getOperand(1));
+    auto *I1 = dyn_cast<Instruction>(Imag->getOperand(1));
+    if (!R1 || !I1)
+      return nullptr;
+
+    Op1 = identifyNode(R1, I1);
+    if (Op1 == nullptr)
+      return nullptr;
+  }
+
+  auto Node = prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric,
+                                   Real, Imag);
+  Node->addOperand(Op0);
+  if (Op1)
+    Node->addOperand(Op1);
+
+  return submitCompositeNode(Node);
+}
+
 ComplexDeinterleavingGraph::NodePtr
 ComplexDeinterleavingGraph::identifyNode(Instruction *Real, Instruction *Imag) {
   LLVM_DEBUG(dbgs() << "identifyNode on " << *Real << " / " << *Imag << "\n");
@@ -815,7 +877,10 @@ ComplexDeinterleavingGraph::identifyNode(Instruction *Real, Instruction *Imag) {
     return identifyAdd(Real, Imag);
   }
 
-  return nullptr;
+  auto Symmetric = identifySymmetricOperation(Real, Imag);
+  LLVM_DEBUG(if (Symmetric == nullptr) dbgs()
+             << " - Not recognised as a valid pattern.\n");
+  return Symmetric;
 }
 
 bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
@@ -847,21 +912,63 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
   return RootNode != nullptr;
 }
 
+/// Emits the replacement for a Symmetric node: the opcode of Node->Real
+/// applied once to the already-replaced (interleaved) inputs.
+/// \p InputB must be null for unary opcodes and non-null for binary ones.
+static Value *replaceSymmetricNode(ComplexDeinterleavingGraph::RawNodePtr Node,
+                                   Value *InputA, Value *InputB) {
+  Instruction *I = Node->Real;
+  // Each check is a single assert so that builds without assertions are not
+  // left with empty if/else bodies.
+  assert((!I->isUnaryOp() || !InputB) &&
+         "Unary symmetric operations need one input, but two were provided.");
+  assert((!I->isBinaryOp() || InputB) &&
+         "Binary symmetric operations need two inputs, only one "
+         "was provided.");
+
+  IRBuilder<> B(I);
+  // The new instruction computes both halves, so it may only carry the
+  // fast-math flags present on the real and the imaginary instruction alike.
+  FastMathFlags FMF = I->getFastMathFlags();
+  FMF &= Node->Imag->getFastMathFlags();
+  B.setFastMathFlags(FMF);
+
+  switch (I->getOpcode()) {
+  case Instruction::FNeg:
+    return B.CreateFNeg(InputA);
+  case Instruction::FAdd:
+    return B.CreateFAdd(InputA, InputB);
+  case Instruction::FSub:
+    return B.CreateFSub(InputA, InputB);
+  case Instruction::FMul:
+    return B.CreateFMul(InputA, InputB);
+  default:
+    // Not a symmetric opcode; replaceNode asserts on the null result.
+    return nullptr;
+  }
+}
+
 Value *ComplexDeinterleavingGraph::replaceNode(
     ComplexDeinterleavingGraph::RawNodePtr Node) {
   if (Node->ReplacementNode)
     return Node->ReplacementNode;
 
   Value *Input0 = replaceNode(Node->Operands[0]);
-  Value *Input1 = replaceNode(Node->Operands[1]);
+  // Unary symmetric nodes (fneg) carry a single operand.
+  Value *Input1 =
+      Node->Operands.size() > 1 ? replaceNode(Node->Operands[1]) : nullptr;
   Value *Accumulator =
       Node->Operands.size() > 2 ? replaceNode(Node->Operands[2]) : nullptr;
 
-  assert(Input0->getType() == Input1->getType() &&
-         "Node inputs need to be of the same type");
+  assert((!Input1 || Input0->getType() == Input1->getType()) &&
+         "Node inputs need to be of the same type");
 
-  Node->ReplacementNode = TL->createComplexDeinterleavingIR(
-      Node->Real, Node->Operation, Node->Rotation, Input0, Input1, Accumulator);
+  if (Node->Operation == ComplexDeinterleavingOperation::Symmetric)
+    Node->ReplacementNode = replaceSymmetricNode(Node, Input0, Input1);
+  else
+    Node->ReplacementNode = TL->createComplexDeinterleavingIR(
+        Node->Real, Node->Operation, Node->Rotation, Input0, Input1,
+        Accumulator);
 
   assert(Node->ReplacementNode && "Target failed to create Intrinsic call.");
   NumComplexTransformations += 1;
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
index d8b30fd9776e8..fd989180ae152 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
@@ -353,3 +353,155 @@ entry:
   %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
   ret <4 x float> %interleaved.vec
 }
+
+; Expected to transform
+define <4 x float> @mul_addequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_addequal:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v3.4s, v0.4s, v1.4s, #0
+; CHECK-NEXT:    fcmla v3.4s, v0.4s, v1.4s, #90
+; CHECK-NEXT:    fadd v0.4s, v3.4s, v2.4s
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %b.imag, %strided.vec
+  %1 = fmul fast <2 x float> %b.real, %a.imag
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %b.real, %strided.vec
+  %4 = fmul fast <2 x float> %a.imag, %b.imag
+  %5 = fsub fast <2 x float> %3, %4
+  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %6 = fadd fast <2 x float> %5, %c.real
+  %7 = fadd fast <2 x float> %2, %c.imag
+  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <4 x float> @mul_subequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_subequal:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v3.4s, v0.4s, v1.4s, #0
+; CHECK-NEXT:    fcmla v3.4s, v0.4s, v1.4s, #90
+; CHECK-NEXT:    fsub v0.4s, v3.4s, v2.4s
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %b.imag, %strided.vec
+  %1 = fmul fast <2 x float> %b.real, %a.imag
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %b.real, %strided.vec
+  %4 = fmul fast <2 x float> %a.imag, %b.imag
+  %5 = fsub fast <2 x float> %3, %4
+  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %6 = fsub fast <2 x float> %5, %c.real
+  %7 = fsub fast <2 x float> %2, %c.imag
+  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+
+; Expected to transform
+define <4 x float> @mul_mulequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_mulequal:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v3.4s, v0.4s, v1.4s, #0
+; CHECK-NEXT:    fcmla v3.4s, v0.4s, v1.4s, #90
+; CHECK-NEXT:    fmul v0.4s, v3.4s, v2.4s
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %b.imag, %strided.vec
+  %1 = fmul fast <2 x float> %b.real, %a.imag
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %b.real, %strided.vec
+  %4 = fmul fast <2 x float> %a.imag, %b.imag
+  %5 = fsub fast <2 x float> %3, %4
+  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %6 = fmul fast <2 x float> %5, %c.real
+  %7 = fmul fast <2 x float> %2, %c.imag
+  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform
+define <4 x float> @mul_divequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_divequal:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v16.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    zip2 v5.2s, v1.2s, v3.2s
+; CHECK-NEXT:    zip1 v1.2s, v1.2s, v3.2s
+; CHECK-NEXT:    zip2 v6.2s, v0.2s, v4.2s
+; CHECK-NEXT:    zip1 v0.2s, v0.2s, v4.2s
+; CHECK-NEXT:    zip1 v4.2s, v2.2s, v16.2s
+; CHECK-NEXT:    zip2 v2.2s, v2.2s, v16.2s
+; CHECK-NEXT:    fmul v7.2s, v6.2s, v5.2s
+; CHECK-NEXT:    fneg v3.2s, v7.2s
+; CHECK-NEXT:    fmla v3.2s, v0.2s, v1.2s
+; CHECK-NEXT:    fmul v0.2s, v5.2s, v0.2s
+; CHECK-NEXT:    fmla v0.2s, v6.2s, v1.2s
+; CHECK-NEXT:    fdiv v3.2s, v3.2s, v4.2s
+; CHECK-NEXT:    fdiv v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    zip1 v0.4s, v3.4s, v0.4s
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %b.imag, %strided.vec
+  %1 = fmul fast <2 x float> %b.real, %a.imag
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %b.real, %strided.vec
+  %4 = fmul fast <2 x float> %a.imag, %b.imag
+  %5 = fsub fast <2 x float> %3, %4
+  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %6 = fdiv fast <2 x float> %5, %c.real
+  %7 = fdiv fast <2 x float> %2, %c.imag
+  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <4 x float> @mul_negequal(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: mul_negequal:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v2.4s, v0.4s, v1.4s, #0
+; CHECK-NEXT:    fcmla v2.4s, v0.4s, v1.4s, #90
+; CHECK-NEXT:    fneg v0.4s, v2.4s
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %b.imag, %strided.vec
+  %1 = fmul fast <2 x float> %b.real, %a.imag
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %b.real, %strided.vec
+  %4 = fmul fast <2 x float> %a.imag, %b.imag
+  %5 = fsub fast <2 x float> %3, %4
+  %6 = fneg fast <2 x float> %5
+  %7 = fneg fast <2 x float> %2
+  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
index b16b06bd45023..a529aa81467e0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
@@ -385,3 +385,194 @@ entry:
   %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
   ret <4 x float> %interleaved.vec
 }
+
+; Expected to transform
+define <4 x float> @mul_addequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_addequal:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d0, r0, r1
+; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    vldrw.u32 q2, [r1]
+; CHECK-NEXT:    vmov d1, r2, r3
+; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    vcmul.f32 q3, q0, q2, #0
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vcmla.f32 q3, q0, q2, #90
+; CHECK-NEXT:    vadd.f32 q0, q3, q1
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %b.imag, %strided.vec
+  %1 = fmul fast <2 x float> %b.real, %a.imag
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %b.real, %strided.vec
+  %4 = fmul fast <2 x float> %a.imag, %b.imag
+  %5 = fsub fast <2 x float> %3, %4
+  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %6 = fadd fast <2 x float> %5, %c.real
+  %7 = fadd fast <2 x float> %2, %c.imag
+  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <4 x float> @mul_subequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_subequal:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d0, r0, r1
+; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    vldrw.u32 q2, [r1]
+; CHECK-NEXT:    vmov d1, r2, r3
+; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    vcmul.f32 q3, q0, q2, #0
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vcmla.f32 q3, q0, q2, #90
+; CHECK-NEXT:    vsub.f32 q0, q3, q1
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %b.imag, %strided.vec
+  %1 = fmul fast <2 x float> %b.real, %a.imag
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %b.real, %strided.vec
+  %4 = fmul fast <2 x float> %a.imag, %b.imag
+  %5 = fsub fast <2 x float> %3, %4
+  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %6 = fsub fast <2 x float> %5, %c.real
+  %7 = fsub fast <2 x float> %2, %c.imag
+  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+
+; Expected to transform
+define <4 x float> @mul_mulequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_mulequal:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d0, r0, r1
+; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    vldrw.u32 q2, [r1]
+; CHECK-NEXT:    vmov d1, r2, r3
+; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    vcmul.f32 q3, q0, q2, #0
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vcmla.f32 q3, q0, q2, #90
+; CHECK-NEXT:    vmul.f32 q0, q3, q1
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %b.imag, %strided.vec
+  %1 = fmul fast <2 x float> %b.real, %a.imag
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %b.real, %strided.vec
+  %4 = fmul fast <2 x float> %a.imag, %b.imag
+  %5 = fsub fast <2 x float> %3, %4
+  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %6 = fmul fast <2 x float> %5, %c.real
+  %7 = fmul fast <2 x float> %2, %c.imag
+  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform
+define <4 x float> @mul_divequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_divequal:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d10, d11}
+; CHECK-NEXT:    vpush {d10, d11}
+; CHECK-NEXT:    .vsave {d8}
+; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    vmov d0, r0, r1
+; CHECK-NEXT:    add r0, sp, #24
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov d1, r2, r3
+; CHECK-NEXT:    vmov.f32 s16, s1
+; CHECK-NEXT:    add.w r12, sp, #40
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmul.f32 q5, q3, q0
+; CHECK-NEXT:    vmov.f32 s9, s6
+; CHECK-NEXT:    vldrw.u32 q1, [r12]
+; CHECK-NEXT:    vmov.f32 s17, s3
+; CHECK-NEXT:    vfma.f32 q5, q2, q4
+; CHECK-NEXT:    vmul.f32 q3, q4, q3
+; CHECK-NEXT:    vdiv.f32 s3, s21, s7
+; CHECK-NEXT:    vneg.f32 q3, q3
+; CHECK-NEXT:    vfma.f32 q3, q2, q0
+; CHECK-NEXT:    vdiv.f32 s1, s20, s5
+; CHECK-NEXT:    vdiv.f32 s2, s13, s6
+; CHECK-NEXT:    vdiv.f32 s0, s12, s4
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vpop {d8}
+; CHECK-NEXT:    vpop {d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %b.imag, %strided.vec
+  %1 = fmul fast <2 x float> %b.real, %a.imag
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %b.real, %strided.vec
+  %4 = fmul fast <2 x float> %a.imag, %b.imag
+  %5 = fsub fast <2 x float> %3, %4
+  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %6 = fdiv fast <2 x float> %5, %c.real
+  %7 = fdiv fast <2 x float> %2, %c.imag
+  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <4 x float> @mul_negequal(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: mul_negequal:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d0, r0, r1
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov d1, r2, r3
+; CHECK-NEXT:    vcmul.f32 q2, q0, q1, #0
+; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #90
+; CHECK-NEXT:    vneg.f32 q0, q2
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %b.imag, %strided.vec
+  %1 = fmul fast <2 x float> %b.real, %a.imag
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %b.real, %strided.vec
+  %4 = fmul fast <2 x float> %a.imag, %b.imag
+  %5 = fsub fast <2 x float> %3, %4
+  %6 = fneg fast <2 x float> %5
+  %7 = fneg fast <2 x float> %2
+  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}