diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 651c504efc067..3f4ebd5015953 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1677,19 +1677,13 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op, // Only manually lower vector shifts assert(Op.getSimpleValueType().isVector()); - // Unroll non-splat vector shifts - BuildVectorSDNode *ShiftVec; - SDValue SplatVal; - if (!(ShiftVec = dyn_cast(Op.getOperand(1).getNode())) || - !(SplatVal = ShiftVec->getSplatValue())) + auto ShiftVal = Op.getOperand(1); + if (!DAG.isSplatValue(ShiftVal, /*AllowUndefs=*/true)) return unrollVectorShift(Op, DAG); - // All splats except i64x2 const splats are handled by patterns - auto *SplatConst = dyn_cast(SplatVal); - if (!SplatConst || Op.getSimpleValueType() != MVT::v2i64) - return Op; + auto SplatVal = DAG.getSplatValue(ShiftVal); + assert(SplatVal != SDValue()); - // i64x2 const splats are custom lowered to avoid unnecessary wraps unsigned Opcode; switch (Op.getOpcode()) { case ISD::SHL: @@ -1704,9 +1698,11 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op, default: llvm_unreachable("unexpected opcode"); } - APInt Shift = SplatConst->getAPIntValue().zextOrTrunc(32); + + // Use anyext because none of the high bits can affect the shift + auto ScalarShift = DAG.getAnyExtOrTrunc(SplatVal, DL, MVT::i32); return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(0), - DAG.getConstant(Shift, DL, MVT::i32)); + ScalarShift); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index b4a8a7bc42ae8..814bb80fb693b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -654,55 +654,36 @@ defm BITMASK : SIMDBitmask; // Bit shifts //===----------------------------------------------------------------------===// -multiclass SIMDShift simdop> { +multiclass SIMDShift simdop> { defm _#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec, I32:$x), (outs), (ins), - [(set (vec_t V128:$dst), - (node V128:$vec, (vec_t shift_vec)))], + [(set (vec_t V128:$dst), (node V128:$vec, I32:$x))], vec#"."#name#"\t$dst, $vec, $x", vec#"."#name, simdop>; } multiclass SIMDShiftInt baseInst> { - defm "" : SIMDShift; - defm "" : SIMDShift; - defm "" : SIMDShift; - defm "" : SIMDShift; + defm "" : SIMDShift; + defm "" : SIMDShift; + defm "" : SIMDShift; + defm "" : SIMDShift; } -// Left shift by scalar: shl -defm SHL : SIMDShiftInt; - -// Right shift by scalar: shr_s / shr_u -defm SHR_S : SIMDShiftInt; -defm SHR_U : SIMDShiftInt; - -// Truncate i64 shift operands to i32s, except if they are already i32s -foreach shifts = [[shl, SHL_v2i64], [sra, SHR_S_v2i64], [srl, SHR_U_v2i64]] in { -def : Pat<(v2i64 (shifts[0] - (v2i64 V128:$vec), - (v2i64 (splat2 (i64 (sext I32:$x)))) - )), - (v2i64 (shifts[1] (v2i64 V128:$vec), (i32 I32:$x)))>; -def : Pat<(v2i64 (shifts[0] (v2i64 V128:$vec), (v2i64 (splat2 I64:$x)))), - (v2i64 (shifts[1] (v2i64 V128:$vec), (I32_WRAP_I64 I64:$x)))>; -} - -// 2xi64 shifts with constant shift amounts are custom lowered to avoid wrapping +// WebAssembly SIMD shifts are nonstandard in that the shift amount is +// an i32 rather than a vector, so they need custom nodes. def wasm_shift_t : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>] >; def wasm_shl : SDNode<"WebAssemblyISD::VEC_SHL", wasm_shift_t>; def wasm_shr_s : SDNode<"WebAssemblyISD::VEC_SHR_S", wasm_shift_t>; def wasm_shr_u : SDNode<"WebAssemblyISD::VEC_SHR_U", wasm_shift_t>; -foreach shifts = [[wasm_shl, SHL_v2i64], - [wasm_shr_s, SHR_S_v2i64], - [wasm_shr_u, SHR_U_v2i64]] in -def : Pat<(v2i64 (shifts[0] (v2i64 V128:$vec), I32:$x)), - (v2i64 (shifts[1] (v2i64 V128:$vec), I32:$x))>; + +// Left shift by scalar: shl +defm SHL : SIMDShiftInt; + +// Right shift by scalar: shr_s / shr_u +defm SHR_S : SIMDShiftInt; +defm SHR_U : SIMDShiftInt; //===----------------------------------------------------------------------===// // Integer binary arithmetic @@ -766,12 +747,12 @@ def add_nuw : PatFrag<(ops node:$lhs, node:$rhs), "return N->getFlags().hasNoUnsignedWrap();">; foreach nodes = [[v16i8, splat16], [v8i16, splat8]] in -def : Pat<(srl +def : Pat<(wasm_shr_u (add_nuw (add_nuw (nodes[0] V128:$lhs), (nodes[0] V128:$rhs)), (nodes[1] (i32 1)) ), - (nodes[0] (nodes[1] (i32 1))) + (i32 1) ), (!cast("AVGR_U_"#nodes[0]) V128:$lhs, V128:$rhs)>; diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll new file mode 100644 index 0000000000000..ded430f895458 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s + +; Test that SIMD shifts can be lowered correctly even with shift +; values that are more complex than plain splats. + +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +;; TODO: Optimize this further by scalarizing the add + +; CHECK-LABEL: shl_add: +; CHECK-NEXT: .functype shl_add (v128, i32, i32) -> (v128) +; CHECK-NEXT: i8x16.splat $push1=, $1 +; CHECK-NEXT: i8x16.splat $push0=, $2 +; CHECK-NEXT: i8x16.add $push2=, $pop1, $pop0 +; CHECK-NEXT: i8x16.extract_lane_u $push3=, $pop2, 0 +; CHECK-NEXT: i8x16.shl $push4=, $0, $pop3 +; CHECK-NEXT: return $pop4 +define <16 x i8> @shl_add(<16 x i8> %v, i8 %a, i8 %b) { + %t1 = insertelement <16 x i8> undef, i8 %a, i32 0 + %va = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer + %t2 = insertelement <16 x i8> undef, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %t2, <16 x i8> undef, <16 x i32> zeroinitializer + %shift = add <16 x i8> %va, %vb + %r = shl <16 x i8> %v, %shift + ret <16 x i8> %r +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-unroll.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-unroll.ll deleted file mode 100644 index 2a5422cb01105..0000000000000 --- a/llvm/test/CodeGen/WebAssembly/simd-shift-unroll.ll +++ /dev/null @@ -1,128 +0,0 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 | FileCheck %s --check-prefixes CHECK,SIMD128,SIMD128-SLOW - -;; Test that the custom shift unrolling works correctly in cases that -;; cause assertion failures due to illegal types when using -;; DAG.UnrollVectorOp. Regression test for PR45178. - -target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" -target triple = "wasm32-unknown-unknown" - -; CHECK-LABEL: shl_v16i8: -; CHECK-NEXT: .functype shl_v16i8 (v128) -> (v128) -; CHECK-NEXT: i8x16.extract_lane_u $push0=, $0, 0 -; CHECK-NEXT: i32.const $push1=, 3 -; CHECK-NEXT: i32.shl $push2=, $pop0, $pop1 -; CHECK-NEXT: i8x16.splat $push3=, $pop2 -; CHECK-NEXT: i8x16.extract_lane_u $push4=, $0, 1 -; CHECK-NEXT: i8x16.replace_lane $push5=, $pop3, 1, $pop4 -; ... -; CHECK: i8x16.extract_lane_u $push32=, $0, 15 -; CHECK-NEXT: i8x16.replace_lane $push33=, $pop31, 15, $pop32 -; CHECK-NEXT: v8x16.shuffle $push34=, $pop33, $0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK-NEXT: return $pop34 -define <16 x i8> @shl_v16i8(<16 x i8> %in) { - %out = shl <16 x i8> %in, - - %ret = shufflevector <16 x i8> %out, <16 x i8> undef, <16 x i32> zeroinitializer - ret <16 x i8> %ret -} - -; CHECK-LABEL: shr_s_v16i8: -; CHECK-NEXT: functype shr_s_v16i8 (v128) -> (v128) -; CHECK-NEXT: i8x16.extract_lane_s $push0=, $0, 0 -; CHECK-NEXT: i32.const $push1=, 3 -; CHECK-NEXT: i32.shr_s $push2=, $pop0, $pop1 -; CHECK-NEXT: i8x16.splat $push3=, $pop2 -; CHECK-NEXT: i8x16.extract_lane_s $push4=, $0, 1 -; CHECK-NEXT: i8x16.replace_lane $push5=, $pop3, 1, $pop4 -; ... -; CHECK: i8x16.extract_lane_s $push32=, $0, 15 -; CHECK-NEXT: i8x16.replace_lane $push33=, $pop31, 15, $pop32 -; CHECK-NEXT: v8x16.shuffle $push34=, $pop33, $0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK-NEXT: return $pop34 -define <16 x i8> @shr_s_v16i8(<16 x i8> %in) { - %out = ashr <16 x i8> %in, - - %ret = shufflevector <16 x i8> %out, <16 x i8> undef, <16 x i32> zeroinitializer - ret <16 x i8> %ret -} - -; CHECK-LABEL: shr_u_v16i8: -; CHECK-NEXT: functype shr_u_v16i8 (v128) -> (v128) -; CHECK-NEXT: i8x16.extract_lane_u $push0=, $0, 0 -; CHECK-NEXT: i32.const $push1=, 3 -; CHECK-NEXT: i32.shr_u $push2=, $pop0, $pop1 -; CHECK-NEXT: i8x16.splat $push3=, $pop2 -; CHECK-NEXT: i8x16.extract_lane_u $push4=, $0, 1 -; CHECK-NEXT: i8x16.replace_lane $push5=, $pop3, 1, $pop4 -; ... -; CHECK: i8x16.extract_lane_u $push32=, $0, 15 -; CHECK-NEXT: i8x16.replace_lane $push33=, $pop31, 15, $pop32 -; CHECK-NEXT: v8x16.shuffle $push34=, $pop33, $0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK-NEXT: return $pop34 -define <16 x i8> @shr_u_v16i8(<16 x i8> %in) { - %out = lshr <16 x i8> %in, - - %ret = shufflevector <16 x i8> %out, <16 x i8> undef, <16 x i32> zeroinitializer - ret <16 x i8> %ret -} - -; CHECK-LABEL: shl_v8i16: -; CHECK-NEXT: functype shl_v8i16 (v128) -> (v128) -; CHECK-NEXT: i16x8.extract_lane_u $push0=, $0, 0 -; CHECK-NEXT: i32.const $push1=, 9 -; CHECK-NEXT: i32.shl $push2=, $pop0, $pop1 -; CHECK-NEXT: i16x8.splat $push3=, $pop2 -; CHECK-NEXT: i16x8.extract_lane_u $push4=, $0, 1 -; CHECK-NEXT: i16x8.replace_lane $push5=, $pop3, 1, $pop4 -; ... -; CHECK: i16x8.extract_lane_u $push16=, $0, 7 -; CHECK-NEXT: i16x8.replace_lane $push17=, $pop15, 7, $pop16 -; CHECK-NEXT: v8x16.shuffle $push18=, $pop17, $0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK-NEXT: return $pop18 -define <8 x i16> @shl_v8i16(<8 x i16> %in) { - %out = shl <8 x i16> %in, - %ret = shufflevector <8 x i16> %out, <8 x i16> undef, <8 x i32> zeroinitializer - ret <8 x i16> %ret -} - -; CHECK-LABEL: shr_s_v8i16: -; CHECK-NEXT: functype shr_s_v8i16 (v128) -> (v128) -; CHECK-NEXT: i16x8.extract_lane_s $push0=, $0, 0 -; CHECK-NEXT: i32.const $push1=, 9 -; CHECK-NEXT: i32.shr_s $push2=, $pop0, $pop1 -; CHECK-NEXT: i16x8.splat $push3=, $pop2 -; CHECK-NEXT: i16x8.extract_lane_s $push4=, $0, 1 -; CHECK-NEXT: i16x8.replace_lane $push5=, $pop3, 1, $pop4 -; ... -; CHECK: i16x8.extract_lane_s $push16=, $0, 7 -; CHECK-NEXT: i16x8.replace_lane $push17=, $pop15, 7, $pop16 -; CHECK-NEXT: v8x16.shuffle $push18=, $pop17, $0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK-NEXT: return $pop18 -define <8 x i16> @shr_s_v8i16(<8 x i16> %in) { - %out = ashr <8 x i16> %in, - %ret = shufflevector <8 x i16> %out, <8 x i16> undef, <8 x i32> zeroinitializer - ret <8 x i16> %ret -} - -; CHECK-LABEL: shr_u_v8i16: -; CHECK-NEXT: functype shr_u_v8i16 (v128) -> (v128) -; CHECK-NEXT: i16x8.extract_lane_u $push0=, $0, 0 -; CHECK-NEXT: i32.const $push1=, 9 -; CHECK-NEXT: i32.shr_u $push2=, $pop0, $pop1 -; CHECK-NEXT: i16x8.splat $push3=, $pop2 -; CHECK-NEXT: i16x8.extract_lane_u $push4=, $0, 1 -; CHECK-NEXT: i16x8.replace_lane $push5=, $pop3, 1, $pop4 -; ... -; CHECK: i16x8.extract_lane_u $push16=, $0, 7 -; CHECK-NEXT: i16x8.replace_lane $push17=, $pop15, 7, $pop16 -; CHECK-NEXT: v8x16.shuffle $push18=, $pop17, $0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK-NEXT: return $pop18 -define <8 x i16> @shr_u_v8i16(<8 x i16> %in) { - %out = lshr <8 x i16> %in, - %ret = shufflevector <8 x i16> %out, <8 x i16> undef, <8 x i32> zeroinitializer - ret <8 x i16> %ret -}