Skip to content

Commit

Permalink
[RISCV] Use v(f)slide1up for shuffle+insert idiom
Browse files Browse the repository at this point in the history
This is pretty straightforward in the basic form. I did need to move the slideup matching earlier, but that looks generally profitable on its own.

As follow-ups, I plan to explore the v(f)slide1down variants, and see what I can do to canonicalize the shuffle then insert pattern (see _inverse tests at the end of the vslide1up.ll test).

Differential Revision: https://reviews.llvm.org/D151468
  • Loading branch information
preames committed May 30, 2023
1 parent 891fad0 commit 544a240
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 68 deletions.
22 changes: 18 additions & 4 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3731,6 +3731,20 @@ static SDValue lowerVECTOR_SHUFFLEAsVSlideup(const SDLoc &DL, MVT VT,
MVT XLenVT = Subtarget.getXLenVT();
MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
auto TrueMask = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).first;
if (Index == 1 && NumSubElts + Index == (int)NumElts &&
isa<BuildVectorSDNode>(InPlace)) {
if (SDValue Splat = cast<BuildVectorSDNode>(InPlace)->getSplatValue()) {
auto OpCode =
VT.isFloatingPoint() ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VSLIDE1UP_VL;
auto Vec = DAG.getNode(OpCode, DL, ContainerVT,
DAG.getUNDEF(ContainerVT),
convertToScalableVector(ContainerVT, ToInsert, DAG, Subtarget),
Splat, TrueMask,
DAG.getConstant(NumSubElts + Index, DL, XLenVT));
return convertFromScalableVector(VT, Vec, DAG, Subtarget);
}
}

// We slide up by the index that the subvector is being inserted at, and set
// VL to the index + the number of elements being inserted.
unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED | RISCVII::MASK_AGNOSTIC;
Expand Down Expand Up @@ -3967,6 +3981,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
Subtarget, DAG);
}

if (SDValue V =
lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;

// Detect an interleave shuffle and lower to
// (vmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1))
int EvenSrc, OddSrc;
Expand All @@ -3989,10 +4007,6 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget);
}

if (SDValue V =
lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;

// Detect shuffles which can be re-expressed as vector selects; these are
// shuffles in which each element in the destination is taken from an element
// at the corresponding index in either source vectors.
Expand Down
14 changes: 4 additions & 10 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll
Original file line number Diff line number Diff line change
Expand Up @@ -171,11 +171,8 @@ define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: trn1.v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-NEXT: vwaddu.vv v10, v8, v9
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
%tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %tmp0
Expand Down Expand Up @@ -256,11 +253,8 @@ define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) {
; CHECK-LABEL: trn1.v2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-NEXT: vwaddu.vv v10, v8, v9
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
%tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 2>
ret <2 x float> %tmp0
Expand Down
106 changes: 52 additions & 54 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,7 @@ define <2 x i8> @vslide1up_2xi8(<2 x i8> %v, i8 %b) {
; CHECK-LABEL: vslide1up_2xi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.x v10, a0
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vwaddu.vv v9, v10, v8
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v9, a0, v8
; CHECK-NEXT: vslide1up.vx v9, v8, a0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <2 x i8> poison, i8 %b, i64 0
Expand All @@ -33,8 +29,7 @@ define <4 x i8> @vslide1up_4xi8(<4 x i8> %v, i8 %b) {
; RV64-LABEL: vslide1up_4xi8:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; RV64-NEXT: vmv.v.x v9, a0
; RV64-NEXT: vslideup.vi v9, v8, 1
; RV64-NEXT: vslide1up.vx v9, v8, a0
; RV64-NEXT: vmv1r.v v8, v9
; RV64-NEXT: ret
%vb = insertelement <4 x i8> poison, i8 %b, i64 0
Expand All @@ -55,8 +50,7 @@ define <4 x i8> @vslide1up_4xi8_swapped(<4 x i8> %v, i8 %b) {
; RV64-LABEL: vslide1up_4xi8_swapped:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; RV64-NEXT: vmv.v.x v9, a0
; RV64-NEXT: vslideup.vi v9, v8, 1
; RV64-NEXT: vslide1up.vx v9, v8, a0
; RV64-NEXT: vmv1r.v v8, v9
; RV64-NEXT: ret
%vb = insertelement <4 x i8> poison, i8 %b, i64 0
Expand All @@ -68,22 +62,16 @@ define <2 x i16> @vslide1up_2xi16(<2 x i16> %v, i16 %b) {
; RV32-LABEL: vslide1up_2xi16:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; RV32-NEXT: vmv.s.x v10, a0
; RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; RV32-NEXT: vwaddu.vv v9, v10, v8
; RV32-NEXT: li a0, -1
; RV32-NEXT: vwmaccu.vx v9, a0, v8
; RV32-NEXT: vmv.s.x v9, a0
; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; RV32-NEXT: vslideup.vi v9, v8, 1
; RV32-NEXT: vmv1r.v v8, v9
; RV32-NEXT: ret
;
; RV64-LABEL: vslide1up_2xi16:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; RV64-NEXT: vmv.v.x v10, a0
; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV64-NEXT: vwaddu.vv v9, v10, v8
; RV64-NEXT: li a0, -1
; RV64-NEXT: vwmaccu.vx v9, a0, v8
; RV64-NEXT: vslide1up.vx v9, v8, a0
; RV64-NEXT: vmv1r.v v8, v9
; RV64-NEXT: ret
%vb = insertelement <2 x i16> poison, i16 %b, i64 0
Expand All @@ -95,8 +83,7 @@ define <4 x i16> @vslide1up_4xi16(<4 x i16> %v, i16 %b) {
; RV32-LABEL: vslide1up_4xi16:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; RV32-NEXT: vmv.v.x v9, a0
; RV32-NEXT: vslideup.vi v9, v8, 1
; RV32-NEXT: vslide1up.vx v9, v8, a0
; RV32-NEXT: vmv1r.v v8, v9
; RV32-NEXT: ret
;
Expand All @@ -117,22 +104,16 @@ define <2 x i32> @vslide1up_2xi32(<2 x i32> %v, i32 %b) {
; RV32-LABEL: vslide1up_2xi32:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV32-NEXT: vmv.v.x v10, a0
; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; RV32-NEXT: vwaddu.vv v9, v10, v8
; RV32-NEXT: li a0, -1
; RV32-NEXT: vwmaccu.vx v9, a0, v8
; RV32-NEXT: vslide1up.vx v9, v8, a0
; RV32-NEXT: vmv1r.v v8, v9
; RV32-NEXT: ret
;
; RV64-LABEL: vslide1up_2xi32:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV64-NEXT: vmv.s.x v10, a0
; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; RV64-NEXT: vwaddu.vv v9, v10, v8
; RV64-NEXT: li a0, -1
; RV64-NEXT: vwmaccu.vx v9, a0, v8
; RV64-NEXT: vmv.s.x v9, a0
; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV64-NEXT: vslideup.vi v9, v8, 1
; RV64-NEXT: vmv1r.v v8, v9
; RV64-NEXT: ret
%vb = insertelement <2 x i32> poison, i32 %b, i64 0
Expand All @@ -144,8 +125,7 @@ define <4 x i32> @vslide1up_4xi32(<4 x i32> %v, i32 %b) {
; CHECK-LABEL: vslide1up_4xi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vslide1up.vx v9, v8, a0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <4 x i32> poison, i32 %b, i64 0
Expand All @@ -171,8 +151,7 @@ define <2 x i64> @vslide1up_2xi64(<2 x i64> %v, i64 %b) {
; RV64-LABEL: vslide1up_2xi64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vmv.v.x v9, a0
; RV64-NEXT: vslideup.vi v9, v8, 1
; RV64-NEXT: vslide1up.vx v9, v8, a0
; RV64-NEXT: vmv.v.v v8, v9
; RV64-NEXT: ret
%vb = insertelement <2 x i64> poison, i64 %b, i64 0
Expand All @@ -198,8 +177,7 @@ define <4 x i64> @vslide1up_4xi64(<4 x i64> %v, i64 %b) {
; RV64-LABEL: vslide1up_4xi64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT: vmv.v.x v10, a0
; RV64-NEXT: vslideup.vi v10, v8, 1
; RV64-NEXT: vslide1up.vx v10, v8, a0
; RV64-NEXT: vmv.v.v v8, v10
; RV64-NEXT: ret
%vb = insertelement <4 x i64> poison, i64 %b, i64 0
Expand All @@ -211,11 +189,7 @@ define <2 x half> @vslide1up_2xf16(<2 x half> %v, half %b) {
; CHECK-LABEL: vslide1up_2xf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vfmv.v.f v10, fa0
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vwaddu.vv v9, v10, v8
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v9, a0, v8
; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <2 x half> poison, half %b, i64 0
Expand All @@ -227,8 +201,7 @@ define <4 x half> @vslide1up_4xf16(<4 x half> %v, half %b) {
; CHECK-LABEL: vslide1up_4xf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <4 x half> poison, half %b, i64 0
Expand All @@ -240,11 +213,7 @@ define <2 x float> @vslide1up_2xf32(<2 x float> %v, float %b) {
; CHECK-LABEL: vslide1up_2xf32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vfmv.v.f v10, fa0
; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-NEXT: vwaddu.vv v9, v10, v8
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v9, a0, v8
; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <2 x float> poison, float %b, i64 0
Expand All @@ -256,8 +225,7 @@ define <4 x float> @vslide1up_4xf32(<4 x float> %v, float %b) {
; CHECK-LABEL: vslide1up_4xf32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <4 x float> poison, float %b, i64 0
Expand All @@ -269,8 +237,7 @@ define <2 x double> @vslide1up_2xf64(<2 x double> %v, double %b) {
; CHECK-LABEL: vslide1up_2xf64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <2 x double> poison, double %b, i64 0
Expand All @@ -291,6 +258,24 @@ define <4 x double> @vslide1up_4xf64(<4 x double> %v, double %b) {
ret <4 x double> %v1
}

; Here %b is first splatted across all four lanes (%v1), and the final shuffle
; takes lane 1 of that splat for result lane 0 and lanes 0..2 of %v for result
; lanes 1..3. The expected output below is a masked vrgather.vv over a
; vmv.v.x splat, not a vslide1up.
define <4 x i8> @vslide1up_4xi8_with_splat(<4 x i8> %v, i8 %b) {
; CHECK-LABEL: vslide1up_4xi8_with_splat:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 14
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: vid.v v9
; CHECK-NEXT: vadd.vi v10, v9, -1
; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <4 x i8> poison, i8 %b, i64 0
%v1 = shufflevector <4 x i8> %vb, <4 x i8> poison, <4 x i32> zeroinitializer
%v2 = shufflevector <4 x i8> %v1, <4 x i8> %v, <4 x i32> <i32 1, i32 4, i32 5, i32 6>
ret <4 x i8> %v2
}

define <2 x double> @vslide1up_v2f64_inverted(<2 x double> %v, double %b) {
; CHECK-LABEL: vslide1up_v2f64_inverted:
; CHECK: # %bb.0:
Expand Down Expand Up @@ -320,7 +305,8 @@ define <4 x i8> @vslide1up_4xi8_inverted(<4 x i8> %v, i8 %b) {
}


; The length of the shift is less than the suffix
; The length of the shift is less than the suffix. Since we'd have to
; materialize the splat, using the vslide1up doesn't help us.
define <4 x i32> @vslide1up_4xi32_neg1(<4 x i32> %v, i32 %b) {
; CHECK-LABEL: vslide1up_4xi32_neg1:
; CHECK: # %bb.0:
Expand All @@ -335,3 +321,15 @@ define <4 x i32> @vslide1up_4xi32_neg1(<4 x i32> %v, i32 %b) {
%v1 = shufflevector <4 x i32> %v, <4 x i32> %vb2, <4 x i32> <i32 4, i32 0, i32 1, i32 7>
ret <4 x i32> %v1
}

; Negative test: result lane 0 is taken from lane 0 of %v2, which is an
; arbitrary vector argument rather than a known scalar, so we don't know the
; scalar to do the vslide1up. The expected output is a plain vslideup.vi by 1.
define <4 x i32> @vslide1up_4xi32_neg2(<4 x i32> %v1, <4 x i32> %v2) {
; CHECK-LABEL: vslide1up_4xi32_neg2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
%res = shufflevector <4 x i32> %v1, <4 x i32> %v2, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
ret <4 x i32> %res
}

0 comments on commit 544a240

Please sign in to comment.