Revert "[RISCV] Recurse on first operand of two operand shuffles (#79180
Browse files Browse the repository at this point in the history
)" (#80238)

This reverts commit bdc4110 on the release/18.x branch. This change was
the first in a mini-series, and while I'm not aware of any particular
problem from having it on its own in the branch, it seems safer to ship
with the previous known-good state.
preames committed Feb 16, 2024
1 parent ab57f6c commit 325d4a1
Showing 6 changed files with 407 additions and 347 deletions.
92 changes: 44 additions & 48 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5033,60 +5033,56 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   MVT IndexContainerVT =
       ContainerVT.changeVectorElementType(IndexVT.getScalarType());

-  // Base case for the recursion just below - handle the worst case
-  // single source permutation. Note that all the splat variants
-  // are handled above.
-  if (V2.isUndef()) {
+  SDValue Gather;
+  // TODO: This doesn't trigger for i64 vectors on RV32, since there we
+  // encounter a bitcasted BUILD_VECTOR with low/high i32 values.
+  if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) {
+    Gather = lowerScalarSplat(SDValue(), SplatValue, VL, ContainerVT, DL, DAG,
+                              Subtarget);
+  } else {
     V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
-    SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
-    LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG,
-                                         Subtarget);
-    SDValue Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
-                                 DAG.getUNDEF(ContainerVT), TrueMask, VL);
-    return convertFromScalableVector(VT, Gather, DAG, Subtarget);
-  }
-
-  // Translate the gather index we computed above (and possibly swapped)
-  // back to a shuffle mask. This step should disappear once we complete
-  // the migration to recursive design.
-  SmallVector<int> ShuffleMaskLHS;
-  ShuffleMaskLHS.reserve(GatherIndicesLHS.size());
-  for (SDValue GatherIndex : GatherIndicesLHS) {
-    if (GatherIndex.isUndef()) {
-      ShuffleMaskLHS.push_back(-1);
-      continue;
+    // If only one index is used, we can use a "splat" vrgather.
+    // TODO: We can splat the most-common index and fix-up any stragglers, if
+    // that's beneficial.
+    if (LHSIndexCounts.size() == 1) {
+      int SplatIndex = LHSIndexCounts.begin()->getFirst();
+      Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V1,
+                           DAG.getConstant(SplatIndex, DL, XLenVT),
+                           DAG.getUNDEF(ContainerVT), TrueMask, VL);
+    } else {
+      SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
+      LHSIndices =
+          convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
+
+      Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
+                           DAG.getUNDEF(ContainerVT), TrueMask, VL);
     }
-    auto *IdxC = cast<ConstantSDNode>(GatherIndex);
-    ShuffleMaskLHS.push_back(IdxC->getZExtValue());
   }

-  // Recursively invoke lowering for the LHS as if there were no RHS.
-  // This allows us to leverage all of our single source permute tricks.
-  SDValue Gather =
-      DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS);
-  Gather = convertToScalableVector(ContainerVT, Gather, DAG, Subtarget);
+  // If a second vector operand is used by this shuffle, blend it in with an
+  // additional vrgather.
+  if (!V2.isUndef()) {
+    V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);

-  // Blend in second vector source with an additional vrgather.
-  V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+    MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
+    SelectMask =
+        convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);

-  MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
-  SelectMask =
-      convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
-
-  // If only one index is used, we can use a "splat" vrgather.
-  // TODO: We can splat the most-common index and fix-up any stragglers, if
-  // that's beneficial.
-  if (RHSIndexCounts.size() == 1) {
-    int SplatIndex = RHSIndexCounts.begin()->getFirst();
-    Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
-                         DAG.getConstant(SplatIndex, DL, XLenVT), Gather,
-                         SelectMask, VL);
-  } else {
-    SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
-    RHSIndices =
-        convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
-    Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather,
-                         SelectMask, VL);
+    // If only one index is used, we can use a "splat" vrgather.
+    // TODO: We can splat the most-common index and fix-up any stragglers, if
+    // that's beneficial.
+    if (RHSIndexCounts.size() == 1) {
+      int SplatIndex = RHSIndexCounts.begin()->getFirst();
+      Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
+                           DAG.getConstant(SplatIndex, DL, XLenVT), Gather,
+                           SelectMask, VL);
+    } else {
+      SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
+      RHSIndices =
+          convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
+      Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather,
+                           SelectMask, VL);
+    }
   }

   return convertFromScalableVector(VT, Gather, DAG, Subtarget);
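For orientation, here is a minimal sketch of the kind of shuffle the restored path above handles. It is illustrative only and not part of this commit; the function name and mask are invented. A fixed-length shuffle that rearranges elements from both sources, and that is not caught by one of the earlier special cases, is lowered to an unmasked vrgather of the first operand followed by a masked vrgather that blends in the second operand under SelectMask; the test updates below show the resulting vrgather.vv / masked vrgather.vv pairs.

; Illustrative example (not from this commit): elements are rearranged from
; both sources, so this shuffle would typically be lowered by the generic path
; above as a vrgather.vv for %v plus a masked vrgather.vv that merges in %w.
define <8 x i8> @two_source_gather_sketch(<8 x i8> %v, <8 x i8> %w) {
  %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 1, i32 8, i32 3, i32 10, i32 5, i32 12, i32 7, i32 14>
  ret <8 x i8> %res
}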
41 changes: 27 additions & 14 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -238,26 +238,39 @@ define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) {
define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
; V128-LABEL: interleave_v32f32:
; V128: # %bb.0:
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; V128-NEXT: vslidedown.vi v0, v8, 16
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT: vwaddu.vv v24, v0, v8
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v24, a0, v8
; V128-NEXT: lui a1, %hi(.LCPI10_0)
; V128-NEXT: addi a1, a1, %lo(.LCPI10_0)
; V128-NEXT: li a2, 32
; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu
; V128-NEXT: vle16.v v12, (a1)
; V128-NEXT: lui a1, 699051
; V128-NEXT: addi a1, a1, -1366
; V128-NEXT: vmv.s.x v0, a1
; V128-NEXT: addi sp, sp, -16
; V128-NEXT: .cfi_def_cfa_offset 16
; V128-NEXT: csrr a0, vlenb
; V128-NEXT: slli a0, a0, 2
; V128-NEXT: sub sp, sp, a0
; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
; V128-NEXT: lui a0, %hi(.LCPI10_0)
; V128-NEXT: addi a0, a0, %lo(.LCPI10_0)
; V128-NEXT: li a1, 32
; V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; V128-NEXT: vle16.v v4, (a0)
; V128-NEXT: lui a0, %hi(.LCPI10_1)
; V128-NEXT: addi a0, a0, %lo(.LCPI10_1)
; V128-NEXT: vle16.v v24, (a0)
; V128-NEXT: addi a0, sp, 16
; V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill
; V128-NEXT: lui a0, 699051
; V128-NEXT: addi a0, a0, -1366
; V128-NEXT: vmv.s.x v0, a0
; V128-NEXT: vrgatherei16.vv v24, v8, v4
; V128-NEXT: addi a0, sp, 16
; V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload
; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT: vwaddu.vv v0, v8, v16
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v0, a0, v16
; V128-NEXT: vmv8r.v v8, v0
; V128-NEXT: vmv8r.v v16, v24
; V128-NEXT: csrr a0, vlenb
; V128-NEXT: slli a0, a0, 2
; V128-NEXT: add sp, sp, a0
; V128-NEXT: addi sp, sp, 16
; V128-NEXT: ret
;
; V512-LABEL: interleave_v32f32:
63 changes: 35 additions & 28 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -188,30 +188,24 @@ define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
; V128-LABEL: interleave_v4i32_offset_1:
; V128: # %bb.0:
; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; V128-NEXT: vwaddu.vv v10, v8, v8
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v10, a0, v8
; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; V128-NEXT: vid.v v8
; V128-NEXT: vsrl.vi v8, v8, 1
; V128-NEXT: vid.v v10
; V128-NEXT: vsrl.vi v11, v10, 1
; V128-NEXT: vrgather.vv v10, v8, v11
; V128-NEXT: vmv.v.i v0, 10
; V128-NEXT: vadd.vi v8, v8, 1
; V128-NEXT: vadd.vi v8, v11, 1
; V128-NEXT: vrgather.vv v10, v9, v8, v0.t
; V128-NEXT: vmv.v.v v8, v10
; V128-NEXT: ret
;
; V512-LABEL: interleave_v4i32_offset_1:
; V512: # %bb.0:
; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; V512-NEXT: vwaddu.vv v10, v8, v8
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v8
; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu
; V512-NEXT: vid.v v8
; V512-NEXT: vsrl.vi v8, v8, 1
; V512-NEXT: vid.v v10
; V512-NEXT: vsrl.vi v11, v10, 1
; V512-NEXT: vrgather.vv v10, v8, v11
; V512-NEXT: vmv.v.i v0, 10
; V512-NEXT: vadd.vi v8, v8, 1
; V512-NEXT: vadd.vi v8, v11, 1
; V512-NEXT: vrgather.vv v10, v9, v8, v0.t
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
@@ -403,26 +397,39 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) {
define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
; V128-LABEL: interleave_v32i32:
; V128: # %bb.0:
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; V128-NEXT: vslidedown.vi v0, v8, 16
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT: vwaddu.vv v24, v0, v8
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v24, a0, v8
; V128-NEXT: lui a1, %hi(.LCPI17_0)
; V128-NEXT: addi a1, a1, %lo(.LCPI17_0)
; V128-NEXT: li a2, 32
; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu
; V128-NEXT: vle16.v v12, (a1)
; V128-NEXT: lui a1, 699051
; V128-NEXT: addi a1, a1, -1366
; V128-NEXT: vmv.s.x v0, a1
; V128-NEXT: addi sp, sp, -16
; V128-NEXT: .cfi_def_cfa_offset 16
; V128-NEXT: csrr a0, vlenb
; V128-NEXT: slli a0, a0, 2
; V128-NEXT: sub sp, sp, a0
; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
; V128-NEXT: lui a0, %hi(.LCPI17_0)
; V128-NEXT: addi a0, a0, %lo(.LCPI17_0)
; V128-NEXT: li a1, 32
; V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; V128-NEXT: vle16.v v4, (a0)
; V128-NEXT: lui a0, %hi(.LCPI17_1)
; V128-NEXT: addi a0, a0, %lo(.LCPI17_1)
; V128-NEXT: vle16.v v24, (a0)
; V128-NEXT: addi a0, sp, 16
; V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill
; V128-NEXT: lui a0, 699051
; V128-NEXT: addi a0, a0, -1366
; V128-NEXT: vmv.s.x v0, a0
; V128-NEXT: vrgatherei16.vv v24, v8, v4
; V128-NEXT: addi a0, sp, 16
; V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload
; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT: vwaddu.vv v0, v8, v16
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v0, a0, v16
; V128-NEXT: vmv8r.v v8, v0
; V128-NEXT: vmv8r.v v16, v24
; V128-NEXT: csrr a0, vlenb
; V128-NEXT: slli a0, a0, 2
; V128-NEXT: add sp, sp, a0
; V128-NEXT: addi sp, sp, 16
; V128-NEXT: ret
;
; V512-LABEL: interleave_v32i32:
43 changes: 27 additions & 16 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -612,11 +612,13 @@ define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: concat_4xi8_start_undef_at_start:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: vid.v v11
; CHECK-NEXT: vrgather.vv v10, v8, v11
; CHECK-NEXT: li a0, 224
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vadd.vi v10, v10, -4
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: vadd.vi v8, v11, -4
; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 9, i32 10, i32 11>
ret <8 x i8> %res
@@ -626,11 +628,13 @@ define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: merge_start_into_end_non_contiguous:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: vid.v v11
; CHECK-NEXT: vrgather.vv v10, v8, v11
; CHECK-NEXT: li a0, 144
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vadd.vi v10, v10, -4
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: vadd.vi v8, v11, -4
; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 11>
ret <8 x i8> %res
@@ -671,11 +675,13 @@ define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: merge_slidedown:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vslidedown.vi v8, v8, 1
; CHECK-NEXT: vid.v v11
; CHECK-NEXT: vadd.vi v12, v11, 1
; CHECK-NEXT: li a0, 195
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 14, i32 15>
ret <8 x i8> %res
@@ -686,12 +692,14 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w
; CHECK-LABEL: merge_non_contiguous_slideup_slidedown:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: vadd.vi v10, v10, -1
; CHECK-NEXT: vid.v v11
; CHECK-NEXT: vadd.vi v12, v11, 2
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: li a0, 234
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: vadd.vi v8, v11, -1
; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 10, i32 6, i32 12, i32 13, i32 14>
ret <8 x i8> %res
@@ -702,13 +710,16 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: unmergable:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: vadd.vi v11, v10, 2
; CHECK-NEXT: lui a0, %hi(.LCPI46_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0)
; CHECK-NEXT: vle8.v v10, (a0)
; CHECK-NEXT: vle8.v v12, (a0)
; CHECK-NEXT: li a0, 234
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: vrgather.vv v10, v8, v11
; CHECK-NEXT: vrgather.vv v10, v9, v12, v0.t
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 9, i32 4, i32 11, i32 6, i32 13, i32 8, i32 15>
ret <8 x i8> %res
