[WebAssembly][NFC] Simplify vector shift lowering and add tests

This patch builds on 0d7286a by simplifying the code for detecting splat values and adding new tests demonstrating the lowering of splatted absolute value shift amounts, which are common in code generated by Halide. The lowering is very bad right now, but subsequent patches will improve it considerably. The tests will be useful for evaluating the improvements in those patches. Reviewed By: aheejin Differential Revision: https://reviews.llvm.org/D83493
llvm · Jul 10, 2020 · 043eaa9 · 043eaa9
1 parent eb6b7c5
commit 043eaa9
Show file tree

Hide file tree

Showing 2 changed files with 81 additions and 8 deletions.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1677,12 +1677,12 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
   // Only manually lower vector shifts
   assert(Op.getSimpleValueType().isVector());
 
-  auto ShiftVal = Op.getOperand(1);
-  if (!DAG.isSplatValue(ShiftVal, /*AllowUndefs=*/true))
+  auto ShiftVal = DAG.getSplatValue(Op.getOperand(1));
+  if (!ShiftVal)
     return unrollVectorShift(Op, DAG);
 
-  auto SplatVal = DAG.getSplatValue(ShiftVal);
-  assert(SplatVal != SDValue());
+  // Use anyext because none of the high bits can affect the shift
+  ShiftVal = DAG.getAnyExtOrTrunc(ShiftVal, DL, MVT::i32);
 
   unsigned Opcode;
   switch (Op.getOpcode()) {
@@ -1699,10 +1699,7 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
     llvm_unreachable("unexpected opcode");
   }
 
-  // Use anyext because none of the high bits can affect the shift
-  auto ScalarShift = DAG.getAnyExtOrTrunc(SplatVal, DL, MVT::i32);
-  return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(0),
-                     ScalarShift);
+  return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(0), ShiftVal);
 }
 
 //===----------------------------------------------------------------------===//

diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll
@@ -25,3 +25,79 @@ define <16 x i8> @shl_add(<16 x i8> %v, i8 %a, i8 %b) {
   %r = shl <16 x i8> %v, %shift
   ret <16 x i8> %r
 }
+
+; CHECK-LABEL: shl_abs:
+; CHECK-NEXT: .functype shl_abs (v128, i32) -> (v128)
+; CHECK-NEXT: i8x16.extract_lane_u $push8=, $0, 0
+; CHECK-NEXT: i8x16.splat $push0=, $1
+; CHECK-NEXT: i8x16.abs $push98=, $pop0
+; CHECK-NEXT: local.tee $push97=, $2=, $pop98
+; CHECK-NEXT: i8x16.extract_lane_u $push6=, $pop97, 0
+; CHECK-NEXT: i32.const $push2=, 7
+; CHECK-NEXT: i32.and $push7=, $pop6, $pop2
+; CHECK-NEXT: i32.shl $push9=, $pop8, $pop7
+; CHECK-NEXT: i8x16.splat $push10=, $pop9
+; CHECK-NEXT: i8x16.extract_lane_u $push4=, $0, 1
+; CHECK-NEXT: i8x16.extract_lane_u $push1=, $2, 1
+; CHECK-NEXT: i32.const $push96=, 7
+; CHECK-NEXT: i32.and $push3=, $pop1, $pop96
+; CHECK-NEXT: i32.shl $push5=, $pop4, $pop3
+; CHECK-NEXT: i8x16.replace_lane $push11=, $pop10, 1, $pop5
+; ...
+; CHECK:      i8x16.extract_lane_u $push79=, $0, 15
+; CHECK-NEXT: i8x16.extract_lane_u $push77=, $2, 15
+; CHECK-NEXT: i32.const $push82=, 7
+; CHECK-NEXT: i32.and $push78=, $pop77, $pop82
+; CHECK-NEXT: i32.shl $push80=, $pop79, $pop78
+; CHECK-NEXT: i8x16.replace_lane $push81=, $pop76, 15, $pop80
+; CHECK-NEXT: return $pop81
+define <16 x i8> @shl_abs(<16 x i8> %v, i8 %a) {
+  %t1 = insertelement <16 x i8> undef, i8 %a, i32 0
+  %va = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %nva = sub <16 x i8> zeroinitializer, %va
+  %c = icmp sgt <16 x i8> %va, zeroinitializer
+  %shift = select <16 x i1> %c, <16 x i8> %va, <16 x i8> %nva
+  %r = shl <16 x i8> %v, %shift
+  ret <16 x i8> %r
+}
+
+; CHECK-LABEL: shl_abs_add:
+; CHECK-NEXT: .functype shl_abs_add (v128, i32, i32) -> (v128)
+; CHECK-NEXT: i8x16.extract_lane_u $push11=, $0, 0
+; CHECK-NEXT: i8x16.splat $push1=, $1
+; CHECK-NEXT: i8x16.splat $push0=, $2
+; CHECK-NEXT: i8x16.add $push2=, $pop1, $pop0
+; CHECK-NEXT: v8x16.shuffle $push3=, $pop2, $0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: i8x16.abs $push101=, $pop3
+; CHECK-NEXT: local.tee $push100=, $3=, $pop101
+; CHECK-NEXT: i8x16.extract_lane_u $push9=, $pop100, 0
+; CHECK-NEXT: i32.const $push5=, 7
+; CHECK-NEXT: i32.and $push10=, $pop9, $pop5
+; CHECK-NEXT: i32.shl $push12=, $pop11, $pop10
+; CHECK-NEXT: i8x16.splat $push13=, $pop12
+; CHECK-NEXT: i8x16.extract_lane_u $push7=, $0, 1
+; CHECK-NEXT: i8x16.extract_lane_u $push4=, $3, 1
+; CHECK-NEXT: i32.const $push99=, 7
+; CHECK-NEXT: i32.and $push6=, $pop4, $pop99
+; CHECK-NEXT: i32.shl $push8=, $pop7, $pop6
+; CHECK-NEXT: i8x16.replace_lane $push14=, $pop13, 1, $pop8
+; ...
+; CHECK:      i8x16.extract_lane_u $push82=, $0, 15
+; CHECK-NEXT: i8x16.extract_lane_u $push80=, $3, 15
+; CHECK-NEXT: i32.const $push85=, 7
+; CHECK-NEXT: i32.and $push81=, $pop80, $pop85
+; CHECK-NEXT: i32.shl $push83=, $pop82, $pop81
+; CHECK-NEXT: i8x16.replace_lane $push84=, $pop79, 15, $pop83
+; CHECK-NEXT: return $pop84
+define <16 x i8> @shl_abs_add(<16 x i8> %v, i8 %a, i8 %b) {
+  %t1 = insertelement <16 x i8> undef, i8 %a, i32 0
+  %va = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %t2 = insertelement <16 x i8> undef, i8 %b, i32 0
+  %vb = shufflevector <16 x i8> %t2, <16 x i8> undef, <16 x i32> zeroinitializer
+  %vadd = add <16 x i8> %va, %vb
+  %nvadd = sub <16 x i8> zeroinitializer, %vadd
+  %c = icmp sgt <16 x i8> %vadd, zeroinitializer
+  %shift = select <16 x i1> %c, <16 x i8> %vadd, <16 x i8> %nvadd
+  %r = shl <16 x i8> %v, %shift
+  ret <16 x i8> %r
+}