Skip to content

Commit

Permalink
[ARM,MVE] Intrinsics for partial-overwrite imm shifts.
Browse files Browse the repository at this point in the history
This batch of intrinsics covers two sets of immediate shift
instructions, which have in common that they only overwrite part of
their output register and so they need an extra input giving its
previous value.

The VSLI and VSRI instructions shift each lane of the input vector
left or right just as if they were normal immediate VSHL/VSHR, but
then they only overwrite the output bits that correspond to actual
shifted bits of the input. So VSLI will leave the low n bits of each
output lane unchanged, and VSRI the same with the top n bits.

The V[Q][R]SHR[U]N family are all narrowing shifts: they take an input
vector of 2n-bit integers, shift each lane right by a constant, and
then narrow the shifted result to only n bits. So they only
overwrite half of the n-bit lanes in the output register, and the B/T
suffix indicates whether it's the bottom or top half of each 2n-bit
lane.

I've implemented the whole of the latter family using a single IR
intrinsic `vshrn`, which takes a lot of i32 parameters indicating
which instruction it expands to (by specifying signedness of the input
and output types, whether it saturates and/or rounds, etc).

Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard

Reviewed By: dmgreen

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D72328
  • Loading branch information
statham-arm committed Jan 8, 2020
1 parent ba129c7 commit 3100480
Show file tree
Hide file tree
Showing 7 changed files with 3,032 additions and 58 deletions.
37 changes: 37 additions & 0 deletions clang/include/clang/Basic/arm_mve.td
Expand Up @@ -651,6 +651,43 @@ multiclass vshll_imm<int top> {
defm vshllbq : vshll_imm<0>;
defm vshlltq : vshll_imm<1>;

// DyadicImmShift defines a pair of intrinsics, <NAME>q_n and its predicated
// form <NAME>q_m_n, each taking two vector operands and an immediate shift
// count:
//   $a  - a value of the output type (for these partial-overwrite shifts,
//         presumably the previous contents of the output register)
//   $b  - the input Vector to be shifted
//   $sh - the immediate shift count, constrained by 'imm'
//
// Template parameters:
//   outtype   - return type, and also the type of $a
//   imm       - Immediate class describing the legal range of $sh
//   intname   - name of the IR intrinsic to call; defaults to NAME
//   extraargs - dag of further arguments appended after $sh in the IR call
multiclass DyadicImmShift<Type outtype, Immediate imm, string intname = NAME,
dag extraargs = (?)> {
// Single-element foreach, used to bind the local name 'intparams': the list
// of types the IR intrinsic is parameterized on. If outtype is Vector itself
// (compared by record name), only [Vector] is needed; otherwise both the
// output and input vector types are listed.
foreach intparams = [!if(!eq(!cast<string>(outtype), !cast<string>(Vector)),
[Vector], [outtype, Vector])] in {
// Unpredicated form: IR call is (intname $a, $b, $sh, extraargs...).
def q_n: Intrinsic<
outtype, (args outtype:$a, Vector:$b, imm:$sh),
!con((IRInt<intname, intparams> $a, $b, $sh), extraargs)>;

// Predicated form: calls intname # "_predicated", which is additionally
// parameterized on Predicate and takes $pred as its final argument, after
// any extraargs.
def q_m_n: Intrinsic<
outtype, (args outtype:$a, Vector:$b, imm:$sh, Predicate:$pred),
!con((IRInt<intname # "_predicated", intparams # [Predicate]>
$a, $b, $sh), extraargs, (? $pred))>;
}
}

// VSHRN defines the 'b' and 't' variants of one narrowing shift. Both map
// onto the single "vshrn" IR intrinsic; the caller's extraargs dag is extended
// with one trailing flag, 0 for the 'b' variant and 1 for the 't' variant
// (presumably bottom/top half of each wide lane -- confirm against the
// intrinsic definition).
multiclass VSHRN<Type outtype, Immediate imm, dag extraargs> {
defm b: DyadicImmShift<outtype, imm, "vshrn", !con(extraargs, (? 0))>;
defm t: DyadicImmShift<outtype, imm, "vshrn", !con(extraargs, (? 1))>;
}

// Narrowing right shifts instantiated for both signed and unsigned 16/32-bit
// inputs, returning HalfVector. Judging by the q/r letters in the intrinsic
// names, the first two flags in each dag are (saturate, round). U is passed
// for both remaining flags, presumably the output and input signedness --
// TODO confirm the order against the vshrn IR intrinsic definition.
let params = [s16, s32, u16, u32], pnt = PNT_NType in {
// Single-element foreach: binds U to the dag (unsignedflag Scalar).
foreach U = [(unsignedflag Scalar)] in {
defm vshrn : VSHRN<HalfVector, imm_1toHalfN, (? 0,0,U,U)>;
defm vqshrn : VSHRN<HalfVector, imm_1toHalfN, (? 1,0,U,U)>;
defm vrshrn : VSHRN<HalfVector, imm_1toHalfN, (? 0,1,U,U)>;
defm vqrshrn : VSHRN<HalfVector, imm_1toHalfN, (? 1,1,U,U)>;
}
}
// Saturating narrowing right shifts that take signed input lanes (params are
// s16/s32 only) and produce unsigned half-width output (UHalfVector). The
// flag dag follows the same layout as the vshrn family: the first two flags
// are (saturate, round); the last two are 1,0 here, matching the unsigned
// output and signed input.
let params = [s16, s32], pnt = PNT_NType in {
defm vqshrun : VSHRN<UHalfVector, imm_1toHalfN, (? 1,0,1,0)>;
// The rounding variant must set the round flag (second element); with it
// clear, vqrshrun would expand to exactly the same operation as vqshrun.
defm vqrshrun : VSHRN<UHalfVector, imm_1toHalfN, (? 1,1,1,0)>;
}
// Shift-and-insert intrinsics for all integer vector types. Input and output
// are the same Vector type, so DyadicImmShift uses its default intrinsic name
// (NAME) and no extra arguments.
let params = T.Int, pnt = PNT_NType in {
// VSLI is a left shift, so its legal shift count is 0 to N-1 (as for an
// immediate left shift), not 1 to N.
defm vsli : DyadicImmShift<Vector, imm_0toNm1>;
// VSRI is a right shift, with a legal shift count of 1 to N.
defm vsri : DyadicImmShift<Vector, imm_1toN>;
}

// Base class for the scalar shift intrinsics.
class ScalarShift<Type argtype, dag shiftCountArg, dag shiftCodeGen>:
Intrinsic<argtype, !con((args argtype:$value), shiftCountArg), shiftCodeGen> {
Expand Down
24 changes: 19 additions & 5 deletions clang/include/clang/Basic/arm_mve_defs.td
Expand Up @@ -190,7 +190,10 @@ def CTO_Pred: ComplexTypeOp;
class CTO_Tuple<int n_>: ComplexTypeOp { int n = n_; }
class CTO_Pointer<bit const_>: ComplexTypeOp { bit const = const_; }
def CTO_CopyKind: ComplexTypeOp;
def CTO_DoubleSize: ComplexTypeOp;
// ComplexTypeOp carrying a rational scale factor num/denom. DoubleSize uses
// it as <2, 1> and HalfSize as <1, 2>, so it presumably multiplies the size
// of a scalar type by num/denom.
class CTO_ScaleSize<int num_, int denom_>: ComplexTypeOp {
int num = num_;
int denom = denom_;
}

// -----------------------------------------------------------------------------
// Instances of Type intended to be used directly in the specification of an
Expand Down Expand Up @@ -268,7 +271,8 @@ class CopyKind<Type s, Type k>: ComplexType<(CTO_CopyKind s, k)>;
// DoubleSize<k> expects k to be a scalar type. It returns a scalar type
// whose kind (signed, unsigned or float) matches that of k, and whose size
// is double that of k, if possible.
class DoubleSize<Type k>: ComplexType<(CTO_DoubleSize k)>;
class DoubleSize<Type k> : ComplexType<(CTO_ScaleSize<2, 1> k)>;
// HalfSize<k> is the counterpart of DoubleSize<k>: it applies a scale of 1/2
// instead of 2/1 to the scalar type k.
class HalfSize<Type k> : ComplexType<(CTO_ScaleSize<1, 2> k)>;

// Unsigned<t> expects t to be a scalar type, and expands to the unsigned
// integer scalar of the same size. So it returns u16 if you give it s16 or
Expand All @@ -280,9 +284,12 @@ class Unsigned<Type t>: ComplexType<(CTO_CopyKind t, u32)>;
def UScalar: Unsigned<Scalar>;
def UVector: VecOf<UScalar>;

// DblVector expands to a vector of scalars of size twice the size of
// Scalar.
// DblVector expands to a vector of scalars of size twice the size of Scalar.
// HalfVector, similarly, expands to a vector of half-sized scalars. And
// UHalfVector is a vector of half-sized _unsigned integers_.
def DblVector: VecOf<DoubleSize<Scalar>>;
def HalfVector: VecOf<HalfSize<Scalar>>;
def UHalfVector: VecOf<Unsigned<HalfSize<Scalar>>>;

// Expands to the 32-bit integer of the same signedness as Scalar.
def Scalar32: CopyKind<u32, Scalar>;
Expand All @@ -305,7 +312,10 @@ class IB_ConstRange<int lo_, int hi_> : ImmediateBounds {
}
def IB_UEltValue : ImmediateBounds;
def IB_LaneIndex : ImmediateBounds;
class IB_EltBit<int base_> : ImmediateBounds { int base = base_; }
// Immediate bounds tied to the bit width of an element type. 'base' is the
// lowest legal value (imm_0toNm1 uses 0 for the range 0..N-1, imm_1toN uses 1
// for 1..N). 'type' is the type whose width N is used; it defaults to Scalar,
// and can be overridden (e.g. with HalfSize<Scalar>) to bound the immediate
// by a different element width.
class IB_EltBit<int base_, Type type_ = Scalar> : ImmediateBounds {
int base = base_;
Type type = type_;
}

// -----------------------------------------------------------------------------
// End-user definitions for immediate arguments.
Expand All @@ -327,8 +337,12 @@ def imm_simd_vmvn : Immediate<u32, IB_UEltValue> {
//
// imm_0toNm1 is the same but with the range offset by 1, i.e. 0 to N-1
// inclusive.
//
// imm_1toHalfN is like imm_1toN, but applied to a half-width type.
// (So if Scalar is s16, for example, it'll give you the range 1 to 8.)
def imm_1toN : Immediate<sint, IB_EltBit<1>>;
def imm_0toNm1 : Immediate<sint, IB_EltBit<0>>;
def imm_1toHalfN : Immediate<sint, IB_EltBit<1, HalfSize<Scalar>>>;

// imm_lane has to be the index of a vector lane in the main vector type, i.e
// it can range from 0 to (128 / size of scalar)-1 inclusive. (e.g. vgetq_lane)
Expand Down

0 comments on commit 3100480

Please sign in to comment.