diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 09b8864dfd7e9..43aa37a25f882 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -3435,8 +3435,7 @@ static SDValue performSETCCCombine(SDNode *N, return SDValue(); } -static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::MUL); +static SDValue TryWideExtMulCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (VT != MVT::v8i32 && VT != MVT::v16i32) return SDValue(); @@ -3522,6 +3521,46 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue performMulCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + assert(N->getOpcode() == ISD::MUL); + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + + if (auto Res = TryWideExtMulCombine(N, DCI.DAG)) + return Res; + + // v16i8 mul is not natively supported but v8i16 mul is, so extend the low and + // high halves of each input to v8i16 and multiply those. Do this only before + // legalization, as a narrow vector may be widened and simplified later. 
+ if (!DCI.isBeforeLegalize() || VT != MVT::v16i8) + return SDValue(); + + SDLoc DL(N); + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue LowLHS = + DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS); + SDValue HighLHS = + DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS); + SDValue LowRHS = + DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS); + SDValue HighRHS = + DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS); + + SDValue MulLow = + DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS)); + SDValue MulHigh = DAG.getBitcast( + VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS)); + + // Take the low byte of each 16-bit product lane (the even byte indices). + return DAG.getVectorShuffle( + VT, DL, MulLow, MulHigh, + {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -3556,6 +3595,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, return performLowerPartialReduction(N, DCI.DAG); } case ISD::MUL: - return performMulCombine(N, DCI.DAG); + return performMulCombine(N, DCI); } } diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll index e3607e12bf530..36637e1d555bd 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll @@ -199,139 +199,17 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) { ; SIMD128-LABEL: mul_v16i8: ; SIMD128: .functype mul_v16i8 (v128, v128) -> (v128) ; SIMD128-NEXT: # %bb.0: -; SIMD128-NEXT: i8x16.extract_lane_u $push4=, $0, 0 -; SIMD128-NEXT: i8x16.extract_lane_u $push3=, $1, 0 -; SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3 -; SIMD128-NEXT: i8x16.splat $push6=, $pop5 -; SIMD128-NEXT: i8x16.extract_lane_u $push1=, $0, 1 -; SIMD128-NEXT: i8x16.extract_lane_u $push0=, $1, 1 -; SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0 -; 
SIMD128-NEXT: i8x16.replace_lane $push7=, $pop6, 1, $pop2 -; SIMD128-NEXT: i8x16.extract_lane_u $push9=, $0, 2 -; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $1, 2 -; SIMD128-NEXT: i32.mul $push10=, $pop9, $pop8 -; SIMD128-NEXT: i8x16.replace_lane $push11=, $pop7, 2, $pop10 -; SIMD128-NEXT: i8x16.extract_lane_u $push13=, $0, 3 -; SIMD128-NEXT: i8x16.extract_lane_u $push12=, $1, 3 -; SIMD128-NEXT: i32.mul $push14=, $pop13, $pop12 -; SIMD128-NEXT: i8x16.replace_lane $push15=, $pop11, 3, $pop14 -; SIMD128-NEXT: i8x16.extract_lane_u $push17=, $0, 4 -; SIMD128-NEXT: i8x16.extract_lane_u $push16=, $1, 4 -; SIMD128-NEXT: i32.mul $push18=, $pop17, $pop16 -; SIMD128-NEXT: i8x16.replace_lane $push19=, $pop15, 4, $pop18 -; SIMD128-NEXT: i8x16.extract_lane_u $push21=, $0, 5 -; SIMD128-NEXT: i8x16.extract_lane_u $push20=, $1, 5 -; SIMD128-NEXT: i32.mul $push22=, $pop21, $pop20 -; SIMD128-NEXT: i8x16.replace_lane $push23=, $pop19, 5, $pop22 -; SIMD128-NEXT: i8x16.extract_lane_u $push25=, $0, 6 -; SIMD128-NEXT: i8x16.extract_lane_u $push24=, $1, 6 -; SIMD128-NEXT: i32.mul $push26=, $pop25, $pop24 -; SIMD128-NEXT: i8x16.replace_lane $push27=, $pop23, 6, $pop26 -; SIMD128-NEXT: i8x16.extract_lane_u $push29=, $0, 7 -; SIMD128-NEXT: i8x16.extract_lane_u $push28=, $1, 7 -; SIMD128-NEXT: i32.mul $push30=, $pop29, $pop28 -; SIMD128-NEXT: i8x16.replace_lane $push31=, $pop27, 7, $pop30 -; SIMD128-NEXT: i8x16.extract_lane_u $push33=, $0, 8 -; SIMD128-NEXT: i8x16.extract_lane_u $push32=, $1, 8 -; SIMD128-NEXT: i32.mul $push34=, $pop33, $pop32 -; SIMD128-NEXT: i8x16.replace_lane $push35=, $pop31, 8, $pop34 -; SIMD128-NEXT: i8x16.extract_lane_u $push37=, $0, 9 -; SIMD128-NEXT: i8x16.extract_lane_u $push36=, $1, 9 -; SIMD128-NEXT: i32.mul $push38=, $pop37, $pop36 -; SIMD128-NEXT: i8x16.replace_lane $push39=, $pop35, 9, $pop38 -; SIMD128-NEXT: i8x16.extract_lane_u $push41=, $0, 10 -; SIMD128-NEXT: i8x16.extract_lane_u $push40=, $1, 10 -; SIMD128-NEXT: i32.mul $push42=, $pop41, $pop40 -; 
SIMD128-NEXT: i8x16.replace_lane $push43=, $pop39, 10, $pop42 -; SIMD128-NEXT: i8x16.extract_lane_u $push45=, $0, 11 -; SIMD128-NEXT: i8x16.extract_lane_u $push44=, $1, 11 -; SIMD128-NEXT: i32.mul $push46=, $pop45, $pop44 -; SIMD128-NEXT: i8x16.replace_lane $push47=, $pop43, 11, $pop46 -; SIMD128-NEXT: i8x16.extract_lane_u $push49=, $0, 12 -; SIMD128-NEXT: i8x16.extract_lane_u $push48=, $1, 12 -; SIMD128-NEXT: i32.mul $push50=, $pop49, $pop48 -; SIMD128-NEXT: i8x16.replace_lane $push51=, $pop47, 12, $pop50 -; SIMD128-NEXT: i8x16.extract_lane_u $push53=, $0, 13 -; SIMD128-NEXT: i8x16.extract_lane_u $push52=, $1, 13 -; SIMD128-NEXT: i32.mul $push54=, $pop53, $pop52 -; SIMD128-NEXT: i8x16.replace_lane $push55=, $pop51, 13, $pop54 -; SIMD128-NEXT: i8x16.extract_lane_u $push57=, $0, 14 -; SIMD128-NEXT: i8x16.extract_lane_u $push56=, $1, 14 -; SIMD128-NEXT: i32.mul $push58=, $pop57, $pop56 -; SIMD128-NEXT: i8x16.replace_lane $push59=, $pop55, 14, $pop58 -; SIMD128-NEXT: i8x16.extract_lane_u $push61=, $0, 15 -; SIMD128-NEXT: i8x16.extract_lane_u $push60=, $1, 15 -; SIMD128-NEXT: i32.mul $push62=, $pop61, $pop60 -; SIMD128-NEXT: i8x16.replace_lane $push63=, $pop59, 15, $pop62 -; SIMD128-NEXT: return $pop63 +; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push1=, $0, $1 +; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push0=, $0, $1 +; SIMD128-NEXT: i8x16.shuffle $push2=, $pop1, $pop0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +; SIMD128-NEXT: return $pop2 ; ; SIMD128-FAST-LABEL: mul_v16i8: ; SIMD128-FAST: .functype mul_v16i8 (v128, v128) -> (v128) ; SIMD128-FAST-NEXT: # %bb.0: -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push5=, $0, 0 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push4=, $1, 0 -; SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4 -; SIMD128-FAST-NEXT: i8x16.splat $push7=, $pop6 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push2=, $0, 1 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push1=, $1, 1 -; SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1 -; 
SIMD128-FAST-NEXT: i8x16.replace_lane $push8=, $pop7, 1, $pop3 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push10=, $0, 2 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push9=, $1, 2 -; SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9 -; SIMD128-FAST-NEXT: i8x16.replace_lane $push12=, $pop8, 2, $pop11 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push14=, $0, 3 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push13=, $1, 3 -; SIMD128-FAST-NEXT: i32.mul $push15=, $pop14, $pop13 -; SIMD128-FAST-NEXT: i8x16.replace_lane $push16=, $pop12, 3, $pop15 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push18=, $0, 4 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push17=, $1, 4 -; SIMD128-FAST-NEXT: i32.mul $push19=, $pop18, $pop17 -; SIMD128-FAST-NEXT: i8x16.replace_lane $push20=, $pop16, 4, $pop19 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push22=, $0, 5 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push21=, $1, 5 -; SIMD128-FAST-NEXT: i32.mul $push23=, $pop22, $pop21 -; SIMD128-FAST-NEXT: i8x16.replace_lane $push24=, $pop20, 5, $pop23 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push26=, $0, 6 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push25=, $1, 6 -; SIMD128-FAST-NEXT: i32.mul $push27=, $pop26, $pop25 -; SIMD128-FAST-NEXT: i8x16.replace_lane $push28=, $pop24, 6, $pop27 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push30=, $0, 7 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push29=, $1, 7 -; SIMD128-FAST-NEXT: i32.mul $push31=, $pop30, $pop29 -; SIMD128-FAST-NEXT: i8x16.replace_lane $push32=, $pop28, 7, $pop31 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push34=, $0, 8 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push33=, $1, 8 -; SIMD128-FAST-NEXT: i32.mul $push35=, $pop34, $pop33 -; SIMD128-FAST-NEXT: i8x16.replace_lane $push36=, $pop32, 8, $pop35 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push38=, $0, 9 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push37=, $1, 9 -; SIMD128-FAST-NEXT: i32.mul $push39=, $pop38, $pop37 -; SIMD128-FAST-NEXT: i8x16.replace_lane $push40=, $pop36, 9, $pop39 -; 
SIMD128-FAST-NEXT: i8x16.extract_lane_u $push42=, $0, 10 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push41=, $1, 10 -; SIMD128-FAST-NEXT: i32.mul $push43=, $pop42, $pop41 -; SIMD128-FAST-NEXT: i8x16.replace_lane $push44=, $pop40, 10, $pop43 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push46=, $0, 11 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push45=, $1, 11 -; SIMD128-FAST-NEXT: i32.mul $push47=, $pop46, $pop45 -; SIMD128-FAST-NEXT: i8x16.replace_lane $push48=, $pop44, 11, $pop47 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push50=, $0, 12 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push49=, $1, 12 -; SIMD128-FAST-NEXT: i32.mul $push51=, $pop50, $pop49 -; SIMD128-FAST-NEXT: i8x16.replace_lane $push52=, $pop48, 12, $pop51 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push54=, $0, 13 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push53=, $1, 13 -; SIMD128-FAST-NEXT: i32.mul $push55=, $pop54, $pop53 -; SIMD128-FAST-NEXT: i8x16.replace_lane $push56=, $pop52, 13, $pop55 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push58=, $0, 14 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push57=, $1, 14 -; SIMD128-FAST-NEXT: i32.mul $push59=, $pop58, $pop57 -; SIMD128-FAST-NEXT: i8x16.replace_lane $push60=, $pop56, 14, $pop59 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push62=, $0, 15 -; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push61=, $1, 15 -; SIMD128-FAST-NEXT: i32.mul $push63=, $pop62, $pop61 -; SIMD128-FAST-NEXT: i8x16.replace_lane $push0=, $pop60, 15, $pop63 +; SIMD128-FAST-NEXT: i16x8.extmul_low_i8x16_u $push2=, $0, $1 +; SIMD128-FAST-NEXT: i16x8.extmul_high_i8x16_u $push1=, $0, $1 +; SIMD128-FAST-NEXT: i8x16.shuffle $push0=, $pop2, $pop1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 ; SIMD128-FAST-NEXT: return $pop0 ; ; NO-SIMD128-LABEL: mul_v16i8: diff --git a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll index 1d194b640eab2..4c30a3adf2378 100644 --- a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll +++ 
b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll @@ -116,40 +116,28 @@ define i8 @pairwise_mul_v16i8(<16 x i8> %arg) { ; SIMD128-LABEL: pairwise_mul_v16i8: ; SIMD128: .functype pairwise_mul_v16i8 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: -; SIMD128-NEXT: i8x16.extract_lane_u $push26=, $0, 0 -; SIMD128-NEXT: i8x16.shuffle $push32=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: local.tee $push31=, $1=, $pop32 -; SIMD128-NEXT: i8x16.extract_lane_u $push25=, $pop31, 0 -; SIMD128-NEXT: i32.mul $push27=, $pop26, $pop25 -; SIMD128-NEXT: i8x16.extract_lane_u $push23=, $0, 4 -; SIMD128-NEXT: i8x16.extract_lane_u $push22=, $1, 4 -; SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22 -; SIMD128-NEXT: i32.mul $push28=, $pop27, $pop24 -; SIMD128-NEXT: i8x16.extract_lane_u $push19=, $0, 2 -; SIMD128-NEXT: i8x16.extract_lane_u $push18=, $1, 2 -; SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18 -; SIMD128-NEXT: i8x16.extract_lane_u $push16=, $0, 6 -; SIMD128-NEXT: i8x16.extract_lane_u $push15=, $1, 6 -; SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15 -; SIMD128-NEXT: i32.mul $push21=, $pop20, $pop17 -; SIMD128-NEXT: i32.mul $push29=, $pop28, $pop21 -; SIMD128-NEXT: i8x16.extract_lane_u $push11=, $0, 1 -; SIMD128-NEXT: i8x16.extract_lane_u $push10=, $1, 1 -; SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10 -; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $0, 5 -; SIMD128-NEXT: i8x16.extract_lane_u $push7=, $1, 5 -; SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7 -; SIMD128-NEXT: i32.mul $push13=, $pop12, $pop9 -; SIMD128-NEXT: i8x16.extract_lane_u $push4=, $0, 3 -; SIMD128-NEXT: i8x16.extract_lane_u $push3=, $1, 3 -; SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3 -; SIMD128-NEXT: i8x16.extract_lane_u $push1=, $0, 7 -; SIMD128-NEXT: i8x16.extract_lane_u $push0=, $1, 7 -; SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0 -; SIMD128-NEXT: i32.mul $push6=, $pop5, $pop2 -; SIMD128-NEXT: i32.mul $push14=, $pop13, $pop6 -; SIMD128-NEXT: i32.mul $push30=, $pop29, $pop14 -; 
SIMD128-NEXT: return $pop30 +; SIMD128-NEXT: i8x16.shuffle $push20=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: local.tee $push19=, $1=, $pop20 +; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push1=, $0, $pop19 +; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push0=, $0, $1 +; SIMD128-NEXT: i8x16.shuffle $push18=, $pop1, $pop0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +; SIMD128-NEXT: local.tee $push17=, $0=, $pop18 +; SIMD128-NEXT: i8x16.shuffle $push16=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: local.tee $push15=, $1=, $pop16 +; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push3=, $pop17, $pop15 +; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push2=, $0, $1 +; SIMD128-NEXT: i8x16.shuffle $push14=, $pop3, $pop2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +; SIMD128-NEXT: local.tee $push13=, $0=, $pop14 +; SIMD128-NEXT: i8x16.shuffle $push12=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: local.tee $push11=, $1=, $pop12 +; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push5=, $pop13, $pop11 +; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push4=, $0, $1 +; SIMD128-NEXT: i8x16.shuffle $push10=, $pop5, $pop4, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +; SIMD128-NEXT: local.tee $push9=, $0=, $pop10 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push7=, $pop9, $pop6 +; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 %res = tail call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %arg) ret i8 %res }