diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 130602650d34e..49af78bce68c3 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1445,6 +1445,49 @@ def : Pat<(v16i8 (wasm_narrow_u (v8i16 V128:$left), (v8i16 V128:$right))), def : Pat<(v8i16 (wasm_narrow_u (v4i32 V128:$left), (v4i32 V128:$right))), (NARROW_U_I16x8 $left, $right)>; +// Recognize a saturating truncation and convert into the corresponding +// narrow_TYPE_s or narrow_TYPE_u instruction. +multiclass SignedSaturatingTruncate { + def : Pat< + (output (wasm_narrow_u + (and (smin (smax (input V128:$a), (splat_vector (i32 minval))), + (splat_vector (i32 maxval))), (splat_vector (i32 mask))), + (and (smin (smax (input V128:$b), (splat_vector (i32 minval))), + (splat_vector (i32 maxval))), (splat_vector (i32 mask))) + )), + (narrow V128:$a, V128:$b) + >; + + def : Pat< + (output (wasm_narrow_u + (and (smax (smin (input V128:$a), (splat_vector (i32 maxval))), + (splat_vector (i32 minval))), (splat_vector (i32 mask))), + (and (smax (smin (input V128:$b), (splat_vector (i32 maxval))), + (splat_vector (i32 minval))), (splat_vector (i32 mask))) + )), + (narrow V128:$a, V128:$b) + >; +} + +defm : SignedSaturatingTruncate; +defm : SignedSaturatingTruncate; + +multiclass UnsignedSaturatingTruncate { + def : Pat< + (output (wasm_narrow_u + (umin (input V128:$a), (splat_vector (i32 maxval))), + (umin (input V128:$b), (splat_vector (i32 maxval))) + )), + (narrow V128:$a, V128:$b) + >; +} + +defm : UnsignedSaturatingTruncate; +defm : UnsignedSaturatingTruncate; + // Bitcasts are nops // Matching bitcast t1 to t1 causes strange errors, so avoid repeating types foreach t1 = AllVecs in diff --git a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll index 52f57dc7d1a3e..a8d37be404cf2 100644 --- a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll @@ -434,7 +434,6 @@ entry: define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-LABEL: stest_f16i16: ; CHECK: .functype stest_f16i16 (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128) -; CHECK-NEXT: .local v128, v128, v128 ; CHECK-NEXT: # %bb.0: # %entry ; CHECK-NEXT: local.get 5 ; CHECK-NEXT: call __truncsfhf2 @@ -474,15 +473,6 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NEXT: call __extendhfsf2 ; CHECK-NEXT: i32.trunc_sat_f32_s ; CHECK-NEXT: i32x4.replace_lane 3 -; CHECK-NEXT: v128.const 32767, 32767, 32767, 32767 -; CHECK-NEXT: local.tee 8 -; CHECK-NEXT: i32x4.min_s -; CHECK-NEXT: v128.const -32768, -32768, -32768, -32768 -; CHECK-NEXT: local.tee 9 -; CHECK-NEXT: i32x4.max_s -; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 -; CHECK-NEXT: local.tee 10 -; CHECK-NEXT: v128.and ; CHECK-NEXT: local.get 4 ; CHECK-NEXT: i32.trunc_sat_f32_s ; CHECK-NEXT: i32x4.splat @@ -495,13 +485,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NEXT: local.get 7 ; CHECK-NEXT: i32.trunc_sat_f32_s ; CHECK-NEXT: i32x4.replace_lane 3 -; CHECK-NEXT: local.get 8 -; CHECK-NEXT: i32x4.min_s -; CHECK-NEXT: local.get 9 -; CHECK-NEXT: i32x4.max_s -; CHECK-NEXT: local.get 10 -; CHECK-NEXT: v128.and -; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i16x8.narrow_i32x4_s ; CHECK-NEXT: # fallthrough-return entry: %conv = fptosi <8 x half> %x to <8 x i32> @@ -516,7 +500,6 @@ entry: define <8 x i16> @utest_f16i16(<8 x half> %x) { ; CHECK-LABEL: utest_f16i16: ; CHECK: .functype utest_f16i16 (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128) -; CHECK-NEXT: .local v128 ; CHECK-NEXT: # %bb.0: # %entry ; CHECK-NEXT: local.get 5 ; CHECK-NEXT: call __truncsfhf2 @@ -556,9 +539,6 @@ define <8 x i16> @utest_f16i16(<8 x half> %x) { ; CHECK-NEXT: call __extendhfsf2 ; CHECK-NEXT: i32.trunc_sat_f32_u ; CHECK-NEXT: i32x4.replace_lane 3 -; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 -; CHECK-NEXT: local.tee 8 -; CHECK-NEXT: i32x4.min_u ; CHECK-NEXT: local.get 4 ; CHECK-NEXT: i32.trunc_sat_f32_u ; CHECK-NEXT: i32x4.splat @@ -571,8 +551,6 @@ define <8 x i16> @utest_f16i16(<8 x half> %x) { ; CHECK-NEXT: local.get 7 ; CHECK-NEXT: i32.trunc_sat_f32_u ; CHECK-NEXT: i32x4.replace_lane 3 -; CHECK-NEXT: local.get 8 -; CHECK-NEXT: i32x4.min_u ; CHECK-NEXT: i16x8.narrow_i32x4_u ; CHECK-NEXT: # fallthrough-return entry: @@ -1861,7 +1839,6 @@ entry: define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-LABEL: stest_f16i16_mm: ; CHECK: .functype stest_f16i16_mm (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128) -; CHECK-NEXT: .local v128, v128, v128 ; CHECK-NEXT: # %bb.0: # %entry ; CHECK-NEXT: local.get 5 ; CHECK-NEXT: call __truncsfhf2 @@ -1901,15 +1878,6 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-NEXT: call __extendhfsf2 ; CHECK-NEXT: i32.trunc_sat_f32_s ; CHECK-NEXT: i32x4.replace_lane 3 -; CHECK-NEXT: v128.const 32767, 32767, 32767, 32767 -; CHECK-NEXT: local.tee 8 -; CHECK-NEXT: i32x4.min_s -; CHECK-NEXT: v128.const -32768, -32768, -32768, -32768 -; CHECK-NEXT: local.tee 9 -; CHECK-NEXT: i32x4.max_s -; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 -; CHECK-NEXT: local.tee 10 -; CHECK-NEXT: v128.and ; CHECK-NEXT: local.get 4 ; CHECK-NEXT: i32.trunc_sat_f32_s ; CHECK-NEXT: i32x4.splat @@ -1922,13 +1890,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-NEXT: local.get 7 ; CHECK-NEXT: i32.trunc_sat_f32_s ; CHECK-NEXT: i32x4.replace_lane 3 -; CHECK-NEXT: local.get 8 -; CHECK-NEXT: i32x4.min_s -; CHECK-NEXT: local.get 9 -; CHECK-NEXT: i32x4.max_s -; CHECK-NEXT: local.get 10 -; CHECK-NEXT: v128.and -; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i16x8.narrow_i32x4_s ; CHECK-NEXT: # fallthrough-return entry: %conv = fptosi <8 x half> %x to <8 x i32> @@ -1941,7 +1903,6 @@ entry: define <8 x i16> @utest_f16i16_mm(<8 x half> %x) { ; CHECK-LABEL: utest_f16i16_mm: ; CHECK: .functype utest_f16i16_mm (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128) -; CHECK-NEXT: .local v128 ; CHECK-NEXT: # %bb.0: # %entry ; CHECK-NEXT: local.get 5 ; CHECK-NEXT: call __truncsfhf2 @@ -1981,9 +1942,6 @@ define <8 x i16> @utest_f16i16_mm(<8 x half> %x) { ; CHECK-NEXT: call __extendhfsf2 ; CHECK-NEXT: i32.trunc_sat_f32_u ; CHECK-NEXT: i32x4.replace_lane 3 -; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 -; CHECK-NEXT: local.tee 8 -; CHECK-NEXT: i32x4.min_u ; CHECK-NEXT: local.get 4 ; CHECK-NEXT: i32.trunc_sat_f32_u ; CHECK-NEXT: i32x4.splat @@ -1996,8 +1954,6 @@ define <8 x i16> @utest_f16i16_mm(<8 x half> %x) { ; CHECK-NEXT: local.get 7 ; CHECK-NEXT: i32.trunc_sat_f32_u ; CHECK-NEXT: i32x4.replace_lane 3 -; CHECK-NEXT: local.get 8 -; CHECK-NEXT: i32x4.min_u ; CHECK-NEXT: i16x8.narrow_i32x4_u ; CHECK-NEXT: # fallthrough-return entry: diff --git a/llvm/test/CodeGen/WebAssembly/saturating-truncation.ll b/llvm/test/CodeGen/WebAssembly/saturating-truncation.ll new file mode 100644 index 0000000000000..f3f3ba9b268d7 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/saturating-truncation.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) #2 +declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>) #2 + +define <16 x i8> @i16_signed(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: i16_signed: +; CHECK: .functype i16_signed (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %bb2 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.narrow_i16x8_s +; CHECK-NEXT: # fallthrough-return +bb2: + %0 = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> + %1 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %0, <16 x i16> splat (i16 -128)) + %2 = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %1, <16 x i16> splat (i16 127)) + %3 = trunc nsw <16 x i16> %2 to <16 x i8> + ret <16 x i8> %3 + ret <16 x i8> %3 +} + +define <8 x i16> @i32_signed(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: i32_signed: +; CHECK: .functype i32_signed (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %bb2 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.narrow_i32x4_s +; CHECK-NEXT: # fallthrough-return +bb2: + %0 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> + %1 = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> %0, <8 x i32> splat (i32 -32768)) + %2 = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %1, <8 x i32> splat (i32 32767)) + %3 = trunc nsw <8 x i32> %2 to <8 x i16> + ret <8 x i16> %3 +} + +define <8 x i16> @i32_signed_flipped(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: i32_signed_flipped: +; CHECK: .functype i32_signed_flipped (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %bb2 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.narrow_i32x4_s +; CHECK-NEXT: # fallthrough-return +bb2: + %0 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> + %1 = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> splat (i32 32767), <8 x i32> %0) + %2 = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> splat (i32 -32768), <8 x i32> %1) + %3 = trunc nsw <8 x i32> %2 to <8 x i16> + ret <8 x i16> %3 +} + +define <16 x i8> @i16_unsigned(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: i16_unsigned: +; CHECK: .functype i16_unsigned (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %bb2 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return +bb2: + %0 = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> + %1 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %0, <16 x i16> splat (i16 255)) + %2 = trunc nuw <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <8 x i16> @i32_unsigned(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: i32_unsigned: +; CHECK: .functype i32_unsigned (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %bb2 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return +bb2: + %0 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> + %1 = tail call <8 x i32> @llvm.umin.v8i32(<8 x i32> %0, <8 x i32> splat (i32 65535)) + %2 = trunc nsw <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +}