[ARM,MVE] Intrinsics for partial-overwrite imm shifts.

This batch of intrinsics covers two sets of immediate shift instructions, which have in common that they only overwrite part of their output register and so they need an extra input giving its previous value. The VSLI and VSRI instructions shift each lane of the input vector left or right just as if they were normal immediate VSHL/VSHR, but then they only overwrite the output bits that correspond to actual shifted bits of the input. So VSLI will leave the low n bits of each output lane unchanged, and VSRI the same with the top n bits. The V[Q][R]SHR[U]N family are all narrowing shifts: they take an input vector of 2n-bit integers, shift each lane right by a constant, and then narrow the shifted result to only n bits. So they only overwrite half of the n-bit lanes in the output register, and the B/T suffix indicates whether it's the bottom or top half of each 2n-bit lane. I've implemented the whole of the latter family using a single IR intrinsic `vshrn`, which takes a lot of i32 parameters indicating which instruction it expands to (by specifying signedness of the input and output types, whether it saturates and/or rounds, etc). Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D72328
llvm · Jan 8, 2020 · 3100480 · 3100480
1 parent ba129c7
commit 3100480
Show file tree

Hide file tree

Showing 7 changed files with 3,032 additions and 58 deletions.
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
@@ -651,6 +651,43 @@ multiclass vshll_imm<int top> {
 defm vshllbq : vshll_imm<0>;
 defm vshlltq : vshll_imm<1>;
 
+// DyadicImmShift defines a pair of intrinsics for an immediate shift that
+// takes two vector operands: $a (of type outtype, also the return type) and
+// $b (of type Vector), plus an immediate shift count $sh constrained by imm.
+//   q_n   - unpredicated form, expanding to the IR intrinsic named intname.
+//   q_m_n - predicated form, expanding to intname with "_predicated"
+//           appended, and taking an extra Predicate argument $pred.
+// Any operands in extraargs are appended after $sh in both forms; in the
+// predicated form $pred comes last, after extraargs.
+multiclass DyadicImmShift<Type outtype, Immediate imm, string intname = NAME,
+                          dag extraargs = (?)> {
+  // Single-element foreach, used to bind the local name intparams: the list
+  // of types parameterizing the IR intrinsic. It is just [Vector] when
+  // outtype and Vector are the same type, and [outtype, Vector] otherwise.
+  foreach intparams = [!if(!eq(!cast<string>(outtype), !cast<string>(Vector)),
+                           [Vector], [outtype, Vector])] in {
+    def q_n: Intrinsic<
+        outtype, (args outtype:$a, Vector:$b, imm:$sh),
+        !con((IRInt<intname, intparams> $a, $b, $sh), extraargs)>;
+
+    // The predicated IR intrinsic is additionally parameterized on Predicate.
+    def q_m_n: Intrinsic<
+        outtype, (args outtype:$a, Vector:$b, imm:$sh, Predicate:$pred),
+        !con((IRInt<intname # "_predicated", intparams # [Predicate]>
+                 $a, $b, $sh), extraargs, (? $pred))>;
+  }
+}
+
+// VSHRN instantiates DyadicImmShift twice against the single "vshrn" IR
+// intrinsic, producing a `b` and a `t` family of intrinsics. The two differ
+// only in one extra trailing operand appended to extraargs: 0 for `b` and 1
+// for `t` (the B/T suffix selects the bottom or top half of each wide lane).
+multiclass VSHRN<Type outtype, Immediate imm, dag extraargs> {
+  defm b: DyadicImmShift<outtype, imm, "vshrn", !con(extraargs, (? 0))>;
+  defm t: DyadicImmShift<outtype, imm, "vshrn", !con(extraargs, (? 1))>;
+}
+
+// Narrowing shifts whose output has the same signedness as the input.
+// The four values in each dag are extra operands passed to the vshrn IR
+// intrinsic. Judging by the instruction names, the first is 1 for the
+// saturating (Q) variants and the second is 1 for the rounding (R) variants;
+// the last two are both U, the value of (unsignedflag Scalar).
+// NOTE(review): the last two presumably give the signedness of the output
+// and input types respectively - confirm against the vshrn intrinsic
+// definition.
+let params = [s16, s32, u16, u32], pnt = PNT_NType in {
+  // Single-element foreach, used to bind U as a local name.
+  foreach U = [(unsignedflag Scalar)] in {
+    defm vshrn   : VSHRN<HalfVector, imm_1toHalfN, (? 0,0,U,U)>;
+    defm vqshrn  : VSHRN<HalfVector, imm_1toHalfN, (? 1,0,U,U)>;
+    defm vrshrn  : VSHRN<HalfVector, imm_1toHalfN, (? 0,1,U,U)>;
+    defm vqrshrn : VSHRN<HalfVector, imm_1toHalfN, (? 1,1,U,U)>;
+  }
+}
+// Saturating narrowing shifts from a signed input to an unsigned output
+// (hence UHalfVector, and only signed element types in params). The extra
+// operands follow the same layout as the other VSHRN users: the first value
+// is 1 for saturating variants and the second is 1 for rounding variants, so
+// the rounding form vqrshrun must set the second value to distinguish itself
+// from vqshrun. The last two values are fixed at 1,0 rather than derived
+// from Scalar.
+let params = [s16, s32], pnt = PNT_NType in {
+  defm vqshrun  : VSHRN<UHalfVector, imm_1toHalfN, (? 1,0,1,0)>;
+  defm vqrshrun : VSHRN<UHalfVector, imm_1toHalfN, (? 1,1,1,0)>;
+}
+// Shift-and-insert intrinsics. Both use DyadicImmShift with outtype equal to
+// Vector and the default intname, so each expands to the IR intrinsic named
+// after the defm itself (vsli / vsri).
+// The immediate ranges differ: a left shift-insert accepts counts from 0 to
+// N-1 (imm_0toNm1), whereas a right shift-insert accepts counts from 1 to N
+// (imm_1toN), where N is the lane width in bits.
+let params = T.Int, pnt = PNT_NType in {
+  defm vsli : DyadicImmShift<Vector, imm_0toNm1>;
+  defm vsri : DyadicImmShift<Vector, imm_1toN>;
+}
+
 // Base class for the scalar shift intrinsics.
 class ScalarShift<Type argtype, dag shiftCountArg, dag shiftCodeGen>:
   Intrinsic<argtype, !con((args argtype:$value), shiftCountArg), shiftCodeGen> {

diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
@@ -190,7 +190,10 @@ def CTO_Pred: ComplexTypeOp;
 class CTO_Tuple<int n_>: ComplexTypeOp { int n = n_; }
 class CTO_Pointer<bit const_>: ComplexTypeOp { bit const = const_; }
 def CTO_CopyKind: ComplexTypeOp;
-def CTO_DoubleSize: ComplexTypeOp;
+// CTO_ScaleSize carries a numerator/denominator pair. It is instantiated as
+// <2, 1> by DoubleSize and as <1, 2> by HalfSize.
+// NOTE(review): presumably the consumer of this record scales the operand's
+// scalar size by num/denom - confirm against the emitter.
+class CTO_ScaleSize<int num_, int denom_>: ComplexTypeOp {
+  int num = num_;
+  int denom = denom_;
+}
 
 // -----------------------------------------------------------------------------
 // Instances of Type intended to be used directly in the specification of an
@@ -268,7 +271,8 @@ class CopyKind<Type s, Type k>: ComplexType<(CTO_CopyKind s, k)>;
 // DoubleSize<k> expects k to be a scalar type. It returns a scalar type
 // whose kind (signed, unsigned or float) matches that of k, and whose size
 // is double that of k, if possible.
-class DoubleSize<Type k>: ComplexType<(CTO_DoubleSize k)>;
+class DoubleSize<Type k> : ComplexType<(CTO_ScaleSize<2, 1> k)>;
+// HalfSize<k> is the counterpart of DoubleSize<k>, using a scale factor of
+// 1/2 instead of 2/1, i.e. it yields the half-sized scalar type.
+class HalfSize<Type k>   : ComplexType<(CTO_ScaleSize<1, 2> k)>;
 
 // Unsigned<t> expects t to be a scalar type, and expands to the unsigned
 // integer scalar of the same size. So it returns u16 if you give it s16 or
@@ -280,9 +284,12 @@ class Unsigned<Type t>: ComplexType<(CTO_CopyKind t, u32)>;
 def UScalar: Unsigned<Scalar>;
 def UVector: VecOf<UScalar>;
 
-// DblVector expands to a vector of scalars of size twice the size of
-// Scalar.
+// DblVector expands to a vector of scalars of size twice the size of Scalar.
+// HalfVector, similarly, expands to a vector of half-sized scalars. And
+// UHalfVector is a vector of half-sized _unsigned integers_.
 def DblVector: VecOf<DoubleSize<Scalar>>;
+def HalfVector: VecOf<HalfSize<Scalar>>;
+def UHalfVector: VecOf<Unsigned<HalfSize<Scalar>>>;
 
 // Expands to the 32-bit integer of the same signedness as Scalar.
 def Scalar32: CopyKind<u32, Scalar>;
@@ -305,7 +312,10 @@ class IB_ConstRange<int lo_, int hi_> : ImmediateBounds {
 }
 def IB_UEltValue : ImmediateBounds;
 def IB_LaneIndex : ImmediateBounds;
-class IB_EltBit<int base_> : ImmediateBounds { int base = base_; }
+// IB_EltBit<base, type>: immediate bounds derived from a scalar type's bit
+// width. With base = 1 the range is 1 to N (as used by imm_1toN); with
+// base = 0 it is 0 to N-1 (imm_0toNm1). `type` names the scalar type whose
+// width is used and defaults to Scalar; passing a different type (e.g.
+// HalfSize<Scalar>) bases the range on that type's width instead.
+class IB_EltBit<int base_, Type type_ = Scalar> : ImmediateBounds {
+  int base = base_;
+  Type type = type_;
+}
 
 // -----------------------------------------------------------------------------
 // End-user definitions for immediate arguments.
@@ -327,8 +337,12 @@ def imm_simd_vmvn : Immediate<u32, IB_UEltValue> {
 //
 // imm_0toNm1 is the same but with the range offset by 1, i.e. 0 to N-1
 // inclusive.
+//
+// imm_1toHalfN is like imm_1toN, but applied to a half-width type.
+// (So if Scalar is s16, for example, it'll give you the range 1 to 8.)
 def imm_1toN : Immediate<sint, IB_EltBit<1>>;
 def imm_0toNm1 : Immediate<sint, IB_EltBit<0>>;
+def imm_1toHalfN : Immediate<sint, IB_EltBit<1, HalfSize<Scalar>>>;
 
 // imm_lane has to be the index of a vector lane in the main vector type, i.e
 // it can range from 0 to (128 / size of scalar)-1 inclusive. (e.g. vgetq_lane)